diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala new file mode 100644 index 0000000000..cc184a2ee5 --- /dev/null +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package com.salesforce.op.stages.impl.feature + +import org.apache.spark.ml.attribute.AttributeGroup +import org.apache.spark.sql.types.StructField +import org.scalatest.{Assertion, Matchers} + +trait AttributeAsserts { + self: Matchers => + /** + * Assert if attributes are nominal or not + * + * @param schema + * @param expectedNominal Expected array of booleans. True if the field is nominal, false if not. + */ + final def assertNominal(schema: StructField, expectedNominal: Array[Boolean]): Assertion = { + val attributes = AttributeGroup.fromStructField(schema).attributes + attributes.map(_.map(_.isNominal).toSeq) shouldBe Some(expectedNominal.toSeq) + } +} diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala index c518495c4b..124469bc42 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/Base64VectorizerTest.scala @@ -41,7 +41,7 @@ import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class Base64VectorizerTest extends FlatSpec with TestSparkContext with Base64TestData { +class Base64VectorizerTest extends FlatSpec with TestSparkContext with Base64TestData with AttributeAsserts { "Base64Vectorizer" should "vectorize random binary data" in { val vec = randomBase64.vectorize(topK = 10, minSupport = 0, cleanText = true, trackNulls = false) @@ -63,6 +63,8 @@ class Base64VectorizerTest extends FlatSpec with TestSparkContext with Base64Tes def assertVectorizer(vec: FeatureLike[OPVector], expected: Seq[Text]): Unit = { val result = new OpWorkflow().setResultFeatures(vec).transform(realData) val vectors = result.collect(vec) + val schema = result.schema(vec.name) + assertNominal(schema, Array.fill(vectors.head.value.size)(true)) vectors.length shouldBe expected.length // TODO add 
a more robust check diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala index c4e79457b9..fe007e1ab9 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryMapVectorizerTest.scala @@ -43,7 +43,8 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class BinaryMapVectorizerTest - extends OpEstimatorSpec[OPVector, SequenceModel[BinaryMap, OPVector], BinaryMapVectorizer[BinaryMap]] { + extends OpEstimatorSpec[OPVector, SequenceModel[BinaryMap, OPVector], BinaryMapVectorizer[BinaryMap]] + with AttributeAsserts { val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2", Seq( @@ -73,6 +74,7 @@ class BinaryMapVectorizerTest transformed.collect(vector) shouldBe expectedResult val field = transformed.schema(estimator.getOutputFeatureName) + assertNominal(field, Array.fill(expectedResult.head.value.size)(true)) OpVectorMetadata(field) shouldEqual expectedMeta val vectorMetadata = estimator.getMetadata() OpVectorMetadata(field.copy(metadata = vectorMetadata)) shouldEqual expectedMeta @@ -100,6 +102,7 @@ class BinaryMapVectorizerTest transformed.collect(vector) shouldBe expected val field = transformed.schema(estimator.getOutputFeatureName) + assertNominal(field, Array.fill(expected.head.value.size)(true)) OpVectorMetadata(field) shouldEqual expectedMeta val vectorMetadata = estimator.getMetadata() OpVectorMetadata(field.copy(metadata = vectorMetadata)) shouldEqual expectedMeta diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala index 47b2ff50cb..9acdfe578c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala +++ 
b/core/src/test/scala/com/salesforce/op/stages/impl/feature/BinaryVectorizerTest.scala @@ -41,7 +41,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] { +class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] with AttributeAsserts { val (inputData, f1, f2) = TestFeatureBuilder( Seq[(Binary, Binary)]( @@ -93,6 +93,8 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] f1 -> List(RootCol, IndCol(Some(TransmogrifierDefaults.NullString))), f2 -> List(RootCol, IndCol(Some(TransmogrifierDefaults.NullString))) ) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) } it should "transform the data correctly [trackNulls=true,fillValue=true]" in { @@ -117,6 +119,8 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] f1 -> List(RootCol, IndCol(Some(TransmogrifierDefaults.NullString))), f2 -> List(RootCol, IndCol(Some(TransmogrifierDefaults.NullString))) ) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) } it should "transform the data correctly [trackNulls=false,fillValue=false]" in { @@ -141,6 +145,8 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] f1 -> List(RootCol), f2 -> List(RootCol) ) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) } it should "transform the data correctly [trackNulls=false,fillValue=true]" in { @@ -165,5 +171,7 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] f1 -> List(RootCol), f2 -> List(RootCol) ) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) } } diff --git 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala index 4cdf0413c0..0915fb2b9d 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateListVectorizerTest.scala @@ -37,6 +37,7 @@ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder, TestOpVect import com.salesforce.op.utils.date.DateTimeUtils import com.salesforce.op.utils.spark.OpVectorMetadata import com.salesforce.op.utils.spark.RichDataset._ +import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.Vectors import org.joda.time.{DateTime, DateTimeConstants} import org.junit.runner.RunWith @@ -44,14 +45,16 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectorizer[DateList]] { +class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectorizer[DateList]] with AttributeAsserts { // Sunday July 12th 1998 at 22:45 val defaultDate = new DateTime(1998, 7, 12, 22, 45, DateTimeUtils.DefaultTimeZone).getMillis val now = TransmogrifierDefaults.ReferenceDate.minusMillis(1).getMillis // make date time be in the past private def daysToMilliseconds(n: Int): Long = n * DateTimeConstants.MILLIS_PER_DAY + private def monthsToMilliseconds(n: Int): Long = n * 2628000000L + private def hoursToMilliseconds(n: Int): Long = n * DateTimeConstants.MILLIS_PER_HOUR val (testData, clicks, opens, purchases) = TestFeatureBuilder("clicks", "opens", "purchases", @@ -122,7 +125,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.dense(2.0, 1.0, -1.0).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + 
assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false)) testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata } @@ -148,7 +153,10 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.dense(2.0, 0.0, 1.0, 0.0, -1.0, 0.0).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size) + (Seq(false, true)).flatten) testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata } @@ -174,7 +182,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.dense(-28.0, -29.0, -31.0).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false)) testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata } @@ -196,7 +206,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.sparse(21, Array(), Array()).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + assertNominal(schema, Array.fill(testModelModeDay.getInputFeatures().size * 7)(true)) testModelModeDay.getMetadata() shouldEqual fieldMetadata val daysOfWeek = List("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday").map(s => @@ -225,7 +237,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.sparse(24, Array(7, 15, 23), Array(1.0, 1.0, 1.0)).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + assertNominal(schema, 
Array.fill(testModelModeDay.getInputFeatures().size * 8)(true)) testModelModeDay.getMetadata() shouldEqual fieldMetadata val daysOfWeek = List("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", @@ -253,7 +267,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.sparse(36, Array(), Array()).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + assertNominal(schema, Array.fill(testModelModeMonth.getInputFeatures().size * 12)(true)) testModelModeMonth.getMetadata() shouldEqual fieldMetadata val months = List( @@ -283,7 +299,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori Vectors.sparse(72, Array(), Array()).toOPVector ) - val fieldMetadata = transformed.schema(output.name).metadata + val schema = transformed.schema(output.name) + val fieldMetadata = schema.metadata + assertNominal(schema, Array.fill(testModelModeHour.getInputFeatures().size * 24)(true)) testModelModeHour.getMetadata() shouldEqual fieldMetadata val hours = (0 until 24).map(i => IndCol(Some(s"$i:00"))).toList diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapToUnitCircleVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapToUnitCircleVectorizerTest.scala index 046d8f7f0f..675f6cf181 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapToUnitCircleVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapToUnitCircleVectorizerTest.scala @@ -44,7 +44,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class DateMapToUnitCircleVectorizerTest extends OpEstimatorSpec[OPVector, SequenceModel[DateMap, OPVector], - DateMapToUnitCircleVectorizer[DateMap]] { + DateMapToUnitCircleVectorizer[DateMap]] with AttributeAsserts { val eps = 1E-4 val sampleDateTimes = 
Seq[JDateTime]( @@ -77,7 +77,9 @@ class DateMapToUnitCircleVectorizerTest extends OpEstimatorSpec[OPVector, Sequen val output = f1.toUnitCircle(TimePeriod.HourOfDay) val transformed = output.originStage.asInstanceOf[DateMapToUnitCircleVectorizer[DateMap]] .fit(inputData).transform(inputData) + val field = transformed.schema(output.name) val actual = transformed.collect(output) + assertNominal(field, Array.fill(actual.head.value.size)(false)) all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } @@ -88,7 +90,9 @@ class DateMapToUnitCircleVectorizerTest extends OpEstimatorSpec[OPVector, Sequen val output = f1DT.toUnitCircle(TimePeriod.HourOfDay) val transformed = output.originStage.asInstanceOf[DateMapToUnitCircleVectorizer[DateMap]] .fit(inputData).transform(inputData) + val field = transformed.schema(output.name) val actual = transformed.collect(output) + assertNominal(field, Array.fill(actual.head.value.size)(false)) all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapVectorizerTest.scala index c68b25ae5b..8067f560f8 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateMapVectorizerTest.scala @@ -44,7 +44,7 @@ import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateMapVectorizerTest extends FlatSpec with TestSparkContext { +class DateMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { // Sunday July 12th 1998 at 22:45 private val defaultDate = new JDateTime(1998, 7, 12, 22, 45, DateTimeUtils.DefaultTimeZone).getMillis @@ -71,6 +71,8 @@ class DateMapVectorizerTest extends FlatSpec with TestSparkContext { val meta = 
OpVectorMetadata(vector.name, transformed.schema(vector.name).metadata) meta.columns.length shouldBe 3 meta.columns.map(_.grouping) should contain theSameElementsAs Array(Option("a"), Option("b"), Option("c")) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected(moment).head.value.size)(false)) val vector2 = f1.vectorize(defaultValue = 0, referenceDate = moment, trackNulls = true, circularDateReps = Seq()) @@ -80,6 +82,8 @@ class DateMapVectorizerTest extends FlatSpec with TestSparkContext { val meta2 = OpVectorMetadata(vector2.name, transformed2.schema(vector2.name).metadata) meta2.columns.length shouldBe 6 meta2.history.keys.size shouldBe 1 + val field2 = transformed2.schema(vector2.name) + assertNominal(field2, Array.fill(expected(moment).head.value.size)(Seq(false, true)).flatten) val vector3 = f1.vectorize(defaultValue = 0) val transformed3 = new OpWorkflow().setResultFeatures(vector3).transform(ds) @@ -88,6 +92,9 @@ class DateMapVectorizerTest extends FlatSpec with TestSparkContext { val meta3 = OpVectorMetadata(vector3.name, transformed3.schema(vector3.name).metadata) meta3.columns.length shouldBe 30 meta2.history.keys.size shouldBe 1 + val field3 = transformed3.schema(vector3.name) + val expectedNominal = Array.fill(24)(false) ++ Array.fill(3)(Seq(false, true)).flatten.asInstanceOf[Array[Boolean]] + assertNominal(field3, expectedNominal) } private def expected(moment: JDateTime) = { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateTimeVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateTimeVectorizerTest.scala index 2dbf2603dd..4f878f127e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateTimeVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateTimeVectorizerTest.scala @@ -45,7 +45,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateTimeVectorizerTest extends FlatSpec 
with TestSparkContext { +class DateTimeVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { // Sunday July 12th 1998 at 22:45 private val defaultDate = new JDateTime(1998, 7, 12, 22, 45, DateTimeUtils.DefaultTimeZone).getMillis @@ -91,6 +91,8 @@ class DateTimeVectorizerTest extends FlatSpec with TestSparkContext { val meta = OpVectorMetadata(vector.name, transformed.schema(vector.name).metadata) meta.columns.length shouldBe 3 meta.history.keys.size shouldBe 3 + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected(moment).head.value.size)(false)) val vector2 = f1.vectorize( dateListPivot = TransmogrifierDefaults.DateListDefault, @@ -105,6 +107,8 @@ class DateTimeVectorizerTest extends FlatSpec with TestSparkContext { val meta2 = OpVectorMetadata(vector2.name, transformed2.schema(vector2.name).metadata) meta2.columns.length shouldBe 6 meta2.history.keys.size shouldBe 3 + val field2 = transformed2.schema(vector2.name) + assertNominal(field2, Array.fill(expected(moment).head.value.size)(Seq(false, true)).flatten) val vector3 = f1.vectorize( dateListPivot = TransmogrifierDefaults.DateListDefault, @@ -117,6 +121,9 @@ class DateTimeVectorizerTest extends FlatSpec with TestSparkContext { val meta3 = OpVectorMetadata(vector3.name, transformed3.schema(vector3.name).metadata) meta3.columns.length shouldBe 30 meta3.history.keys.size shouldBe 6 + val field3 = transformed3.schema(vector3.name) + val expectedNominal = Array.fill(24)(false) ++ Array.fill(3)(Seq(false, true)).flatten.asInstanceOf[Array[Boolean]] + assertNominal(field3, expectedNominal) } it should "vectorize dates correctly any time" in { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala index 4a3de8f06b..7c0dede857 100644 --- 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateToUnitCircleTransformerTest.scala @@ -43,7 +43,8 @@ import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateToUnitCircleTransformer[Date]] { +class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateToUnitCircleTransformer[Date]] + with AttributeAsserts { val eps = 1E-4 val sampleDateTimes = Seq[JDateTime]( @@ -66,7 +67,10 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val vectorizer = new DateToUnitCircleTransformer().setTimePeriod(timePeriod).setInput(f) val transformed = vectorizer.transform(ds) val vector = vectorizer.getOutput() - transformed.collect(vector) + val actual = transformed.collect(vector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(actual.head.value.size)(false)) + actual } def indexSeqToUnitCircle(indices: Seq[Int], numIndices: Int): Seq[OPVector] = { @@ -80,7 +84,9 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val output = dateFeature.toUnitCircle(TimePeriod.HourOfDay) val transformed = output.originStage.asInstanceOf[Transformer].transform(ds) val actual = transformed.collect(output) - all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + val field = transformed.schema(output.name) + assertNominal(field, Array.fill(actual.head.value.size)(false)) } it should "work with its DateTime shortcut" in { @@ -89,7 +95,9 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val output = dateTimeFeature.toUnitCircle(TimePeriod.HourOfDay) val transformed = 
output.originStage.asInstanceOf[Transformer].transform(ds) val actual = transformed.collect(output) - all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + val field = transformed.schema(output.name) + assertNominal(field, Array.fill(actual.head.value.size)(false)) } it should "store the proper meta data" in { @@ -98,7 +106,7 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val vectorizer = new DateToUnitCircleTransformer().setTimePeriod(HourOfDay).setInput(feature) val transformed = vectorizer.transform(ds) val meta = OpVectorMetadata(transformed.schema(vectorizer.getOutput().name)) - meta.columns.length should equal (2) + meta.columns.length should equal(2) meta.columns(0).descriptorValue shouldBe Some("x_HourOfDay") meta.columns(1).descriptorValue shouldBe Some("y_HourOfDay") } @@ -117,12 +125,14 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo Array(0.0, 0.0), Array(1.0, 0.0) ).map(Vectors.dense(_).toOPVector) - all (actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(actual.head.value.size)(false)) } it should "transform the data correctly when the timePeriod is HourOfDay" in { val actual = transformData(sampleDateTimes, HourOfDay) - all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is DayOfYear" in { @@ -136,7 +146,7 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val actual = transformData(dateTimes, 
DayOfYear) val sampleDaysOfYearMinusOne = Array(0, 1, 2, 3, 31) val expected = indexSeqToUnitCircle(sampleDaysOfYearMinusOne, 366) - all (actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is DayOfWeek" in { @@ -151,7 +161,7 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo ) val actual = transformData(dateTimes, DayOfWeek) val expectedDaysOfWeekMinusOne = indexSeqToUnitCircle(Seq(0, 1, 2, 3, 4, 5, 6), 7) - all (actual.zip(expectedDaysOfWeekMinusOne).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expectedDaysOfWeekMinusOne).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is WeekOfYear" in { @@ -165,14 +175,14 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo val actual = transformData(dateTimes, WeekOfYear) val sampleWeeksOfYearMinusOne = Seq(51, 0, 1, 2, 3) val expected = indexSeqToUnitCircle(sampleWeeksOfYearMinusOne, 53) - all (actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is DayOfMonth" in { val actual = transformData(sampleDateTimes, DayOfMonth) val sampleDaysOfMonthMinusOne = Seq(10, 27, 16, 16, 12) val expected = indexSeqToUnitCircle(sampleDaysOfMonthMinusOne, 31) - all (actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is MonthOfYear" in { @@ -186,7 +196,7 @@ class DateToUnitCircleTransformerTest extends 
OpTransformerSpec[OPVector, DateTo val actual = transformData(dateTimes, MonthOfYear) val sampleMonthsOfYearMinusOne = Seq(0, 1, 2, 3, 11) val expected = indexSeqToUnitCircle(sampleMonthsOfYearMinusOne, 12) - all (actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } it should "transform the data correctly when the timePeriod is WeekOfMonth" in { @@ -199,6 +209,6 @@ class DateToUnitCircleTransformerTest extends OpTransformerSpec[OPVector, DateTo ) val actual = transformData(dateTimes, WeekOfMonth) val expected = indexSeqToUnitCircle(Seq(0, 1, 2, 3, 4), 6) - all (actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps + all(actual.zip(expected).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateVectorizerTest.scala index a44b75a22d..a92069621b 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DateVectorizerTest.scala @@ -44,7 +44,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class DateVectorizerTest extends FlatSpec with TestSparkContext { +class DateVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { require(DateTimeUtils.DefaultTimeZone == DateTimeZone.UTC) // Sunday July 12th 1998 at 22:45 @@ -69,6 +69,8 @@ class DateVectorizerTest extends FlatSpec with TestSparkContext { val meta = OpVectorMetadata(vector.name, transformed.schema(vector.name).metadata) meta.columns.length shouldBe 3 meta.history.keys.size shouldBe 3 + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expectedAt(moment).head.value.size)(false)) val vector2 = f1.vectorize( 
dateListPivot = TransmogrifierDefaults.DateListDefault, @@ -83,6 +85,8 @@ class DateVectorizerTest extends FlatSpec with TestSparkContext { val meta2 = OpVectorMetadata(vector2.name, transformed2.schema(vector2.name).metadata) meta2.columns.length shouldBe 6 meta2.history.keys.size shouldBe 3 + val field2 = transformed2.schema(vector2.name) + assertNominal(field2, Array.fill(expectedAt(moment).head.value.size)(Seq(false, true)).flatten) val vector3 = f1.vectorize( dateListPivot = TransmogrifierDefaults.DateListDefault, @@ -94,6 +98,9 @@ class DateVectorizerTest extends FlatSpec with TestSparkContext { val meta3 = OpVectorMetadata(vector3.name, transformed3.schema(vector3.name).metadata) meta3.columns.length shouldBe 30 meta3.history.keys.size shouldBe 6 + val field3 = transformed3.schema(vector3.name) + val expectedNominal = Array.fill(24)(false) ++ Array.fill(3)(Seq(false, true)).flatten.asInstanceOf[Array[Boolean]] + assertNominal(field3, expectedNominal) } private def buildTestData(moment: DateTime) = { diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala index b2d916e8f1..7a58d35825 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericBucketizerTest.scala @@ -48,7 +48,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class DecisionTreeNumericBucketizerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, Real, OPVector], DecisionTreeNumericBucketizer[Double, Real]] - with DecisionTreeNumericBucketizerAsserts + with DecisionTreeNumericBucketizerAsserts with AttributeAsserts { val (inputData, estimator) = { val numericData = Seq(1.0.toReal, 18.0.toReal, Real.empty, (-1.23).toReal, 0.0.toReal) @@ -203,8 +203,11 @@ class 
DecisionTreeNumericBucketizerTest extends OpEstimatorSpec[OPVector, val splits = model.splits assertSplits(splits = splits, expectedSplits = expectedSplits, expectedTolerance) - val res = model.transform(data).collect(out) - assertMetadata( + val transformed = model.transform(data) + val res = transformed.collect(out) + val field = transformed.schema(out.name) + assertNominal(field, Array.fill(res.head.value.size)(true)) + assertMetadata( shouldSplit = Array(shouldSplit), splits = Array(splits), trackNulls = trackNulls, trackInvalid = trackInvalid, diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala index 512a30c01e..aecb39fe6e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DecisionTreeNumericMapBucketizerTest.scala @@ -47,7 +47,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class DecisionTreeNumericMapBucketizerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, RealMap, OPVector], DecisionTreeNumericMapBucketizer[Double, RealMap]] - with DecisionTreeNumericBucketizerAsserts + with DecisionTreeNumericBucketizerAsserts with AttributeAsserts { import OPMapVectorizerTestHelper._ @@ -228,6 +228,8 @@ class DecisionTreeNumericMapBucketizerTest extends OpEstimatorSpec[OPVector, ) val scored = model.setInputDataset(data).score(keepIntermediateFeatures = true) val res = scored.collect(out) + val field = scored.schema(out.name) + assertNominal(field, Array.fill(res.head.value.size)(true)) assertMetadata( shouldSplit = stage.shouldSplitByKey.toArray.sortBy(_._1).map(_._2), splits = stage.splitsByKey.toArray.sortBy(_._1).map(_._2), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala 
b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala index f73cedccc0..b2f9f9570d 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/DropIndicesByTransformerTest.scala @@ -43,9 +43,8 @@ import org.scalatest.junit.JUnitRunner import org.apache.spark.sql.functions._ - @RunWith(classOf[JUnitRunner]) -class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndicesByTransformer] { +class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndicesByTransformer] with AttributeAsserts { val (inputData, transformer) = { val vecData = Seq( @@ -73,8 +72,11 @@ class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndic .setInput(vectorizedPicklist) .getOutput() val materializedFeatures = new OpWorkflow().setResultFeatures(vectorizedPicklist, prunedVector).transform(df) + val field = materializedFeatures.schema(prunedVector.name) + val collectedFeatures = materializedFeatures.collect(prunedVector) + assertNominal(field, Array.fill(collectedFeatures.head.value.size)(true)) - materializedFeatures.collect(prunedVector).foreach(_.value.size shouldBe 4) + collectedFeatures.foreach(_.value.size shouldBe 4) materializedFeatures.collect().foreach { r => if (r.getString(0) == "Red") r.getAs[Vector](2).toArray.forall(_ == 0) shouldBe true else r.getAs[Vector](2).toArray.max shouldBe 1 @@ -89,9 +91,12 @@ class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndic val vectorizedPicklist = picklistFeature.vectorize(topK = 10, minSupport = 3, cleanText = false) val prunedVector = vectorizedPicklist.dropIndicesBy(_.isNullIndicator) val materializedFeatures = new OpWorkflow().setResultFeatures(vectorizedPicklist, prunedVector).transform(df) + val field = materializedFeatures.schema(prunedVector.name) + val collectedFeatures = 
materializedFeatures.collect(prunedVector) + assertNominal(field, Array.fill(collectedFeatures.head.value.size)(true)) - materializedFeatures.collect(prunedVector).foreach(_.value.size shouldBe 4) - materializedFeatures.collect().foreach( _.getAs[Vector](2).toArray.max shouldBe 1) + collectedFeatures.foreach(_.value.size shouldBe 4) + materializedFeatures.collect().foreach(_.getAs[Vector](2).toArray.max shouldBe 1) val rawMeta = OpVectorMetadata(vectorizedPicklist.name, vectorizedPicklist.originStage.getMetadata()) val trimmedMeta = OpVectorMetadata(materializedFeatures.schema(prunedVector.name)) rawMeta.columns.length - 1 shouldBe trimmedMeta.columns.length diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala index b193403327..a3f84f6a1a 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/EmailVectorizerTest.scala @@ -45,7 +45,8 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class EmailVectorizerTest - extends FlatSpec with FeatureTestBase with RichMapFeature with RichFeature with RichTextFeature { + extends FlatSpec with FeatureTestBase with RichMapFeature with RichFeature with RichTextFeature + with AttributeAsserts { val emailKey = "Email1" val emailKey2 = "Email2" val emails = (RandomText.emails("salesforce.com").take(2) ++ RandomText.emails("einstein.ai").take(2)).toSeq @@ -80,8 +81,13 @@ class EmailVectorizerTest ).map(_.toOPVector) - def transformAndCollect(ds: DataFrame, feature: FeatureLike[OPVector]): Array[OPVector] = - new OpWorkflow().setResultFeatures(feature).transform(ds).collect(feature) + def transformAndCollect(ds: DataFrame, feature: FeatureLike[OPVector]): Array[OPVector] = { + val transformed = new OpWorkflow().setResultFeatures(feature).transform(ds) + val field = 
transformed.schema(feature.name) + val collected = transformed.collect(feature) + assertNominal(field, Array.fill(collected.head.value.size)(true)) + collected + } Spec[RichEmailMapFeature] should "vectorize EmailMaps correctly" in { val (ds1, f1) = TestFeatureBuilder(emails.map(e => Map(emailKey -> e.value.get).toEmailMap)) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala index bdf3674c58..92531ee2d2 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationMapVectorizerTest.scala @@ -44,7 +44,8 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class GeolocationMapVectorizerTest - extends OpEstimatorSpec[OPVector, SequenceModel[GeolocationMap, OPVector], GeolocationMapVectorizer] { + extends OpEstimatorSpec[OPVector, SequenceModel[GeolocationMap, OPVector], GeolocationMapVectorizer] + with AttributeAsserts { val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2", Seq( @@ -93,7 +94,8 @@ class GeolocationMapVectorizerTest val vectorizer = estimator.fit(inputData) val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() - + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expectedResult transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -112,6 +114,9 @@ class GeolocationMapVectorizerTest Vectors.sparse(24, Array(3, 7, 11, 15, 19, 23), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, + Array.fill(expected.head.value.size / 4)(Seq(false, false, false, true)).flatten) 
transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls val vectorMetadata = vectorizer.getMetadata() @@ -128,6 +133,8 @@ class GeolocationMapVectorizerTest Array(6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 33.8, -108.7, 2.0, 40.4, -116.3, 2.0, 42.5, -95.4, 4.0, 6.0, 6.0, 6.0), Array.fill(18)(6.0) ).map(v => Vectors.dense(v).toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(false)) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta @@ -147,6 +154,9 @@ class GeolocationMapVectorizerTest 0.0, 6.0, 6.0, 6.0, 1.0), (0 until 6).flatMap(k => Seq.fill(3)(6.0) :+ 1.0).toArray ).map(v => Vectors.dense(v).toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, + Array.fill(expected.head.value.size / 4)(Seq(false, false, false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls @@ -164,6 +174,9 @@ class GeolocationMapVectorizerTest Vectors.sparse(9, Array(), Array()), Vectors.sparse(9, Array(), Array()) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(false)) + val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> (Geolocation.Names.map(n => DescColWithGroup(Option(n), "A")) ++ @@ -188,6 +201,10 @@ class GeolocationMapVectorizerTest Vectors.sparse(12, Array(3, 7, 11), Array(1.0, 1.0, 1.0)), Vectors.sparse(12, Array(3, 7, 11), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, + Array.fill(expected.head.value.size / 4)(Seq(false, false, false, true)).flatten) + val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> ( @@ -215,6 +232,9 @@ class 
GeolocationMapVectorizerTest Vectors.dense(Array(0.0, 0.0, 0.0, 33.8, -108.7, 2.0, 40.4, -116.3, 2.0, 42.5, -95.4, 4.0)), Vectors.sparse(12, Array(), Array()) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(false)) + val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> (Geolocation.Names.map(n => DescColWithGroup(Option(n), "B")) ++ @@ -239,6 +259,10 @@ class GeolocationMapVectorizerTest Vectors.dense(Array(0.0, 0.0, 0.0, 1.0, 33.8, -108.7, 2.0, 0.0, 40.4, -116.3, 2.0, 0.0, 42.5, -95.4, 4.0, 0.0)), Vectors.sparse(16, Array(3, 7, 11, 15), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, + Array.fill(expected.head.value.size / 4)(Seq(false, false, false, true)).flatten) + val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> ( @@ -263,6 +287,9 @@ class GeolocationMapVectorizerTest val vector = vectorizer.getOutput() val expectedOutput = transformed.collect() + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size / 4) + (Seq(false, false, false, true)).flatten) // Now using the shortcut val res = m1.vectorize(cleanKeys = TransmogrifierDefaults.CleanKeys, others = Array(m2)) res.originStage shouldBe a[GeolocationMapVectorizer] diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationVectorizerTest.scala index 305d6749c5..03157b32f5 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/GeolocationVectorizerTest.scala @@ -39,10 +39,11 @@ import org.apache.spark.ml.linalg.{DenseVector, Vectors} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{Assertions, FlatSpec, 
Matchers} +import com.salesforce.op.utils.spark.RichDataset._ @RunWith(classOf[JUnitRunner]) -class GeolocationVectorizerTest extends FlatSpec with TestSparkContext { +class GeolocationVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val (testData, inA, inB, inC, inD) = TestFeatureBuilder("inA", "inB", "inC", "inD", Seq[(Geolocation, Geolocation, Geolocation, Geolocation)]( @@ -97,6 +98,10 @@ class GeolocationVectorizerTest extends FlatSpec with TestSparkContext { (Array(45.0, -105.5, 4.0), null, null, null, Vectors.dense(45.0, -105.5, 4.0, 50.0, 50.0, 4.0, 50.0, 50.0, 4.0, 50.0, 50.0, 4.0)) ) + val output = testModelConstant.getOutputFeatureName + val field = testDataTransformedConstant.schema(output) + assertNominal( + field, Array.fill(expectedConstant.head._5.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedConstant.map(_._1) transformedValuesConstant.map(_.get(1)) shouldEqual expectedConstant.map(_._2) @@ -129,7 +134,10 @@ class GeolocationVectorizerTest extends FlatSpec with TestSparkContext { (Array(45.0, -105.5, 4.0), null, null, null, Vectors.dense(Array(45.0, -105.5, 4.0) ++ mean1 ++ mean2 ++ mean3)) ) - + val output = testModelMean.getOutputFeatureName + val field = testDataTransformedMean.schema(output) + assertNominal( + field, Array.fill(expectedMean.head._5.size)(false)) transformedValuesMean.map(_.get(0)) shouldEqual expectedMean.map(_._1) transformedValuesMean.map(_.get(1)) shouldEqual expectedMean.map(_._2) transformedValuesMean.map(_.get(2)) shouldEqual expectedMean.map(_._3) @@ -169,7 +177,10 @@ class GeolocationVectorizerTest extends FlatSpec with TestSparkContext { (Array(45.0, -105.5, 4.0), null, null, null, Vectors.dense(Array(45.0, -105.5, 4.0, 0.0) ++ mean1 ++ mean2 ++ mean3)) ) - + val output = testModelMean.getOutputFeatureName + val field = testDataTransformedMean.schema(output) + assertNominal( + field, Array.fill(expectedMean.head._5.size / 4)(Seq(false, false, false, 
true)).flatten) transformedValuesMean.map(_.get(0)) shouldEqual expectedMean.map(_._1) transformedValuesMean.map(_.get(1)) shouldEqual expectedMean.map(_._2) transformedValuesMean.map(_.get(2)) shouldEqual expectedMean.map(_._3) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala index 2f54a056be..0dcbc0ac9e 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralMapVectorizerTest.scala @@ -44,7 +44,8 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class IntegralMapVectorizerTest - extends OpEstimatorSpec[OPVector, SequenceModel[IntegralMap, OPVector], IntegralMapVectorizer[IntegralMap]] { + extends OpEstimatorSpec[OPVector, SequenceModel[IntegralMap, OPVector], IntegralMapVectorizer[IntegralMap]] + with AttributeAsserts { val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2", Seq( @@ -83,6 +84,8 @@ class IntegralMapVectorizerTest it should "return a model that correctly transforms the data and produces metadata" in { val vector = estimator.getOutput() val transformed = model.transform(inputData) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expectedResult transformed.schema.toOpVectorMetadata(estimator.getOutputFeatureName) shouldEqual expectedMeta @@ -100,6 +103,8 @@ class IntegralMapVectorizerTest Vectors.sparse(12, Array(1, 3, 4, 8, 11), Array(1.0, 1.0, 11.0, 3.0, 1.0)), Vectors.sparse(12, Array(1, 3, 5, 7, 9, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(6)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected 
transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls @@ -117,6 +122,8 @@ class IntegralMapVectorizerTest Vectors.dense(Array(100.0, 100.0, 11.0, 0.0, 3.0, 100.0)), Vectors.dense(Array(100.0, 100.0, 100.0, 100.0, 100.0, 100.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(false)) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta @@ -134,6 +141,8 @@ class IntegralMapVectorizerTest Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 11.0, 0.0, 0.0, 0.0, 3.0, 0.0, 100.0, 1.0)), Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(6)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls @@ -153,6 +162,8 @@ class IntegralMapVectorizerTest Vectors.sparse(3, Array(), Array()), Vectors.sparse(3, Array(), Array()) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(false)) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B")), @@ -177,6 +188,8 @@ class IntegralMapVectorizerTest Vectors.sparse(6, Array(1, 3, 5), Array(1.0, 1.0, 1.0)), Vectors.sparse(6, Array(1, 3, 5), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(3)(Seq(false, true)).flatten) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(nullIndicatorValue, "A"), @@ -201,6 +214,8 @@ class IntegralMapVectorizerTest Vectors.dense(Array(0.0, 11.0, 0.0, 3.0)), 
Vectors.sparse(4, Array(), Array()) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(false)) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "B"), IndColWithGroup(None, "C")), @@ -224,6 +239,8 @@ class IntegralMapVectorizerTest Vectors.sparse(8, Array(1, 2, 6), Array(1.0, 11.0, 3.0)), Vectors.sparse(8, Array(1, 3, 5, 7), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(4)(Seq(false, true)).flatten) val expectedMeta = TestOpVectorMetadataBuilder( vectorizer, m1 -> List(IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B"), diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralVectorizerTest.scala index 6d6db538a0..49cc689cc1 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/IntegralVectorizerTest.scala @@ -42,7 +42,7 @@ import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class IntegralVectorizerTest extends FlatSpec with TestSparkContext { +class IntegralVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val (testData, inA, inB, inC, inD) = TestFeatureBuilder("inA", "inB", "inC", "inD", Seq( @@ -113,7 +113,8 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, 2L, 2L, null, Vectors.dense(3.0, 2.0, 2.0, 3.0)), (null, null, null, null, Vectors.dense(3.0, 3.0, 3.0, 3.0)) ) - + val field = testDataTransformedConstant.schema(testModelConstant.getOutputFeatureName) + assertNominal(field, Array.fill(expectedZero.head._5.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedZero.map(_._1) transformedValuesConstant.map(_.get(1)) 
shouldEqual expectedZero.map(_._2) transformedValuesConstant.map(_.get(2)) shouldEqual expectedZero.map(_._3) @@ -142,7 +143,8 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, 2.0, null, Vectors.dense(4.0, 2.0, 2.0, 0.0)), (null, null, null, null, Vectors.dense(4.0, 2.0, 1.0, 0.0)) ) - + val field = testDataTransformedMode.schema(testModelMode.getOutputFeatureName) + assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) transformedValuesMode.map(_.get(0)) shouldEqual expectedMode.map(_._1) transformedValuesMode.map(_.get(1)) shouldEqual expectedMode.map(_._2) transformedValuesMode.map(_.get(2)) shouldEqual expectedMode.map(_._3) @@ -168,6 +170,8 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, 2.0, null, Vectors.dense(0.0, 1.0, 2.0, 0.0, 2.0, 0.0, 0.0, 1.0)), (null, null, null, null, Vectors.dense(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0)) ) + val field = testDataTransformedConstantTracked.schema(testModelConstantTracked.getOutputFeatureName) + assertNominal(field, Array.fill(expectedZeroTracked.head._5.size / 2)(Seq(false, true)).flatten) transformedValuesZeroTracked.map(_.get(0)) shouldEqual expectedZeroTracked.map(_._1) transformedValuesZeroTracked.map(_.get(1)) shouldEqual expectedZeroTracked.map(_._2) @@ -206,6 +210,8 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, 2.0, null, Vectors.dense(4.0, 1.0, 2.0, 0.0, 2.0, 0.0, 0.0, 1.0)), (null, null, null, null, Vectors.dense(4.0, 1.0, 2.0, 1.0, 1.0, 1.0, 0.0, 1.0)) ) + val field = testDataTransformedModeTracked.schema(testModelModeTracked.getOutputFeatureName) + assertNominal(field, Array.fill(expectedModeTracked.head._5.size / 2)(Seq(false, true)).flatten) transformedValuesModeTracked.map(_.get(0)) shouldEqual expectedModeTracked.map(_._1) transformedValuesModeTracked.map(_.get(1)) shouldEqual expectedModeTracked.map(_._2) @@ -248,6 +254,8 @@ class IntegralVectorizerTest extends FlatSpec with 
TestSparkContext { (null, 2.0, 2.0, null, Vectors.dense(4.0, 2.0, 2.0, 0.0)), (null, null, null, null, Vectors.dense(4.0, 2.0, 1.0, 0.0)) ) + val field = testDataTransformedMode.schema(testModelMode.getOutputFeatureName) + assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) transformedValuesMode.map(_.get(0)) shouldEqual expectedMode.map(_._1) transformedValuesMode.map(_.get(1)) shouldEqual expectedMode.map(_._2) @@ -278,6 +286,8 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, 2.0, null, Vectors.dense(4.0, 2.0, 2.0, 0.0)), (null, null, null, null, Vectors.dense(4.0, 2.0, 1.0, 0.0)) ) + val field = testDataTransformedMode.schema(testModelMode.getOutputFeatureName) + assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) transformedValuesMode.map(_.get(0)) shouldEqual expectedMode.map(_._1) transformedValuesMode.map(_.get(1)) shouldEqual expectedMode.map(_._2) @@ -308,6 +318,8 @@ class IntegralVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, 2.0, null, Vectors.dense(4.0, 2.0, 2.0, 0.0)), (null, null, null, null, Vectors.dense(4.0, 2.0, 1.0, 0.0)) ) + val field = testDataTransformedMode.schema(testModelMode.getOutputFeatureName) + assertNominal(field, Array.fill(expectedMode.head._5.size)(false)) transformedValuesMode.map(_.get(0)) shouldEqual expectedMode.map(_._1) transformedValuesMode.map(_.get(1)) shouldEqual expectedMode.map(_._2) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/MultiPickListMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/MultiPickListMapVectorizerTest.scala index 501bd55b7d..a26996e006 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/MultiPickListMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/MultiPickListMapVectorizerTest.scala @@ -46,7 +46,7 @@ import org.slf4j.LoggerFactory @RunWith(classOf[JUnitRunner]) -class MultiPickListMapVectorizerTest 
extends FlatSpec with TestSparkContext { +class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val log = LoggerFactory.getLogger(this.getClass) @@ -94,6 +94,10 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { it should "return the a fitted vectorizer with the correct default parameters" in { val fitted = vectorizer.setTrackNulls(false).fit(dataSet) fitted shouldBe a[SequenceModel[_, _]] + val transformed = fitted.transform(dataSet) + val vector = vectorizer.getOutput() + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) val vectorMetadata = fitted.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -115,6 +119,10 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { it should "track nulls with the correct default parameters" in { val fitted = vectorizer.setTrackNulls(true).fit(dataSet) fitted shouldBe a[SequenceModel[_, _]] + val transformed = fitted.transform(dataSet) + val vector = vectorizer.getOutput() + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) val vectorMetadata = fitted.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -151,6 +159,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(14, Array(0, 7, 9), Array(1.0, 1.0, 1.0)), Vectors.sparse(14, Array(0, 2, 11), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -167,6 +177,8 @@ class 
MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(20, Array(0, 6, 9, 10, 13, 19), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(20, Array(0, 3, 9, 12, 15, 16), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -183,6 +195,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(17, Array(0, 9, 11), Array(1.0, 1.0, 1.0)), Vectors.sparse(17, Array(1, 3, 14), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -212,6 +226,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(23, Array(0, 7, 10, 12, 15, 22), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(23, Array(1, 4, 10, 14, 18, 19), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, @@ -243,6 +259,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(12, Array(0, 6, 8), Array(1.0, 1.0, 1.0)), Vectors.sparse(12, Array(0, 2, 10), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -258,6 +276,8 @@ class 
MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(18, Array(0, 5, 8, 9, 12, 17), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(18, Array(0, 3, 8, 11, 14, 15), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -271,6 +291,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(10, Array(0, 5, 7), Array(1.0, 1.0, 1.0)), Vectors.sparse(10, Array(0, 2, 9), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -284,6 +306,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(16, Array(0, 5, 7, 8, 11, 15), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(16, Array(0, 3, 7, 10, 13, 14), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -298,6 +322,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 1.0, 0.0, 0.0, 0.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected @@ -308,6 +334,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) + val field2 = transformed2.schema(vector.name) + assertNominal(field2, Array.fill(expected2.head.value.size)(true)) transformed2.collect(vector) shouldBe expected2 } @@ -322,6 +350,8 
@@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected @@ -332,6 +362,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0), Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0) ).map(_.toOPVector) + val field2 = transformed2.schema(vector.name) + assertNominal(field2, Array.fill(expected2.head.value.size)(true)) transformed2.collect(vector) shouldBe expected2 } @@ -344,6 +376,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(Array.empty[Double]), Vectors.dense(Array.empty[Double]) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -361,6 +395,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(5, Array(3), Array(1.0)), Vectors.sparse(5, Array(0), Array(1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -378,6 +414,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(7, Array(3, 4), Array(1.0, 1.0)), Vectors.sparse(7, Array(0, 6), Array(1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -395,6 +433,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(9, Array(0, 7), Array(1.0, 1.0)), Vectors.sparse(9, 
Array(0, 4), Array(1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -412,6 +452,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(13, Array(0, 5, 9, 10), Array(1.0, 1.0, 1.0, 1.0)), Vectors.sparse(13, Array(0, 5, 6, 12), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -431,6 +473,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(8, Array(3, 7), Array(2.0, 1.0)), Vectors.sparse(8, Array(0, 2, 3, 4, 5, 6), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -450,6 +494,8 @@ class MultiPickListMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(10, Array(3, 8), Array(2.0, 1.0)), Vectors.dense(1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala index b313edd76f..a35ae9dae9 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericBucketizerTest.scala @@ -44,7 +44,7 @@ import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class NumericBucketizerTest extends FlatSpec with 
TestSparkContext { +class NumericBucketizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { trait GenericTest { val numbers = Seq(Some(10.0), None, Some(3.0), Some(5.0), Some(6.0), None, Some(1.0), Some(0.0)) @@ -158,7 +158,7 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { buck.getBucketLabels shouldBe Array("A", "B", "C") } - it should "throw an exception if the data is out of bounds when trackInvalid is false" in new GenericTest { + it should "throw an exception if the data is out of bounds when trackInvalid is false" in new GenericTest { val vals = Seq(Double.PositiveInfinity, Double.NaN, -1, -100).map(_.toReal) lazy val (data, num) = TestFeatureBuilder("num", vals) val buck = new NumericBucketizer[Real]().setInput(num).setBuckets(splits).setTrackInvalid(false) @@ -168,7 +168,11 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { it should "transform the data correctly (reals)" in new RealTest { val vector = realBucketizer.getOutput() - realBucketizer.transform(data1).collect(vector) shouldBe expectedAns + val transformed = realBucketizer.transform(data1) + val actual = transformed.collect(vector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(actual.head.value.size)(true)) + actual shouldBe expectedAns val expectedMeta = TestOpVectorMetadataBuilder( realBucketizer, @@ -182,19 +186,31 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { IndCol(Some("[5.0-10.0)")), IndCol(Some("[10.0-Infinity)"))) ) OpVectorMetadata(realBucketizer2.getOutputFeatureName, realBucketizer2.getMetadata()) shouldEqual expectedMeta2 + val vector2 = realBucketizer2.getOutput() + val transformed2 = realBucketizer2.transform(data1) + val actual2 = transformed2.collect(vector2) + val field2 = transformed2.schema(vector2.name) + assertNominal(field2, Array.fill(actual2.head.value.size)(true)) } it should "work as a shortcut (reals)" in new RealTest { val vector = 
num.bucketize(trackNulls = false, splits = splits, bucketLabels = bucketLabels) vector.originStage shouldBe a[NumericBucketizer[_]] val buck = vector.originStage.asInstanceOf[NumericBucketizer[_]] - - buck.transform(data1).collect(vector) shouldBe expectedAns + val transformed = buck.transform(data1) + val actual = transformed.collect(vector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(actual.head.value.size)(true)) + actual shouldBe expectedAns } it should "keep track of null values if wanted (reals) " in new RealTest { val vector = trackNullsRealBucketizer.getOutput() - trackNullsRealBucketizer.transform(data1).collect(vector) shouldBe trackNullsExpectedAns + val transformed = trackNullsRealBucketizer.transform(data1) + val actual = transformed.collect(vector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(actual.head.value.size)(true)) + actual shouldBe trackNullsExpectedAns val expectedMeta = TestOpVectorMetadataBuilder( trackNullsRealBucketizer, @@ -216,12 +232,21 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { trackNullsRealBucketizer2.getOutputFeatureName, trackNullsRealBucketizer2.getMetadata() ) shouldEqual expectedMeta2 + val vector2 = trackNullsRealBucketizer2.getOutput() + val transformed2 = trackNullsRealBucketizer2.transform(data1) + val actual2 = transformed2.collect(vector2) + val field2 = transformed2.schema(vector2.name) + assertNominal(field2, Array.fill(actual2.head.value.size)(true)) } it should "allow right inclusive splits (reals)" in new RealTest { val vector = num.bucketize(trackNulls = false, splits = splitsRightInclusive, splitInclusion = Inclusion.Right) val buck = vector.originStage.asInstanceOf[NumericBucketizer[_]] - buck.transform(data1).collect(vector) shouldBe expectedRightInclusiveAns + val transformed = buck.transform(data1) + val actual = transformed.collect(vector) + val field = transformed.schema(vector.name) + assertNominal(field, 
Array.fill(actual.head.value.size)(true)) + actual shouldBe expectedRightInclusiveAns } it should "correctly bucketize some random reals" in { @@ -235,9 +260,13 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { splits = Array(Double.NegativeInfinity, 0.0, Double.PositiveInfinity), splitInclusion = Inclusion.Left ) + val bucketizer = buck.originStage + val transformed = bucketizer.asInstanceOf[NumericBucketizer[_]].transform(ds) + val results = transformed.collect(buck) + val field = transformed.schema(buck.name) + assertNominal(field, Array.fill(results.head.value.size)(true)) - buck.originStage shouldBe a[NumericBucketizer[_]] - val results = buck.originStage.asInstanceOf[NumericBucketizer[_]].transform(ds).collect(buck) + bucketizer shouldBe a[NumericBucketizer[_]] val (neg, pos, empty) = (Vectors.dense(1.0, 0.0, 0.0).toOPVector, @@ -256,7 +285,8 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { ) val buck = num.bucketize(trackNulls = true, trackInvalid = true, splits = Array(0.0, 1.0, 5.0)) val stage = buck.originStage.asInstanceOf[NumericBucketizer[_]] - val results = stage.transform(ds).collect(buck) + val transformed = stage.transform(ds) + val results = transformed.collect(buck) results shouldBe Seq( Vectors.dense(0.0, 0.0, 0.0, 1.0), @@ -266,6 +296,9 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 1.0, 0.0) ).map(_.toOPVector) + val field = transformed.schema(buck.name) + assertNominal(field, Array.fill(results.head.value.size)(true)) + val expectedMeta = TestOpVectorMetadataBuilder( stage, num -> List(IndCol(Some("[0.0-1.0)")), IndCol(Some("[1.0-5.0)")), IndCol(Some(TransmogrifierDefaults.OtherString)), IndCol(Some(TransmogrifierDefaults.NullString)) @@ -276,8 +309,12 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { it should "transform the data correctly (integrals)" in new IntegralTest { val vector = integralBucketizer.getOutput() - 
integralBucketizer.transform(data1).collect(vector) shouldBe expectedAns + val transformed = integralBucketizer.transform(data1) + val results = transformed.collect(vector) + results shouldBe expectedAns + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(results.head.value.size)(true)) val expectedMeta = TestOpVectorMetadataBuilder( integralBucketizer, num -> List(IndCol(Some("0-1")), IndCol(Some("1-5")), IndCol(Some("5-10")), IndCol(Some("10-Infinity"))) @@ -289,12 +326,20 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { val vector = num.bucketize(trackNulls = false, splits = splits, bucketLabels = bucketLabels) vector.originStage shouldBe a[NumericBucketizer[_]] val buck = vector.originStage.asInstanceOf[NumericBucketizer[_]] - buck.transform(data1).collect(vector) shouldBe expectedAns + val transformed = buck.transform(data1) + val results = transformed.collect(vector) + results shouldBe expectedAns + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(results.head.value.size)(true)) } it should "keep track of null values if wanted (integrals)" in new IntegralTest { val vector = trackNullsIntegralBucketizer.getOutput() - trackNullsIntegralBucketizer.transform(data1).collect(vector) shouldBe trackNullsExpectedAns + val transformed = trackNullsIntegralBucketizer.transform(data1) + val results = transformed.collect(vector) + results shouldBe trackNullsExpectedAns + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(results.head.value.size)(true)) val expectedMeta = TestOpVectorMetadataBuilder( trackNullsIntegralBucketizer, @@ -310,7 +355,12 @@ class NumericBucketizerTest extends FlatSpec with TestSparkContext { it should "allow right inclusive splits (integrals)" in new IntegralTest { val vector = num.bucketize(trackNulls = false, splits = splitsRightInclusive, splitInclusion = Inclusion.Right) val buck = vector.originStage.asInstanceOf[NumericBucketizer[_]] - 
buck.transform(data1).collect(vector) shouldBe expectedRightInclusiveAns + val transformed = buck.transform(data1) + val results = transformed.collect(vector) + results shouldBe expectedRightInclusiveAns + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(results.head.value.size)(true)) + } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala index e6b2c98644..dfe0852bdb 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/NumericVectorizerTest.scala @@ -45,7 +45,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class NumericVectorizerTest extends FlatSpec with FeatureTestBase { +class NumericVectorizerTest extends FlatSpec with FeatureTestBase with AttributeAsserts { val ageData: Seq[Real] = RandomReal.uniform[Real](maxValue = 80.0).limit(100) val heightData: Seq[Real] = RandomReal.normal[Real](mean = 65.0, sigma = 8).limit(100) @@ -75,6 +75,8 @@ class NumericVectorizerTest extends FlatSpec with FeatureTestBase { Array(3.0, 0.0, 0.0, 1.0), Array(4.0, 0.0, 0.0, 1.0) ).map(Vectors.dense(_).toOPVector) + val field = vectorized.schema(autoBucketFeature.name) + assertNominal(field, false +: Array.fill(expected.head.value.size - 1)(true)) vectorized.collect(autoBucketFeature) should contain theSameElementsAs expected } it should "vectorize single real feature with a label" in { @@ -86,7 +88,9 @@ class NumericVectorizerTest extends FlatSpec with FeatureTestBase { age.autoBucketize(labelData, trackNulls = false) ).combine() val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) - + val field = vectorized.schema(autoBucketFeature.name) + assertNominal(field, false +: + 
Array.fill(vectorized.collect(autoBucketFeature).head.value.size - 1)(true)) for {(autoAge, manualAge) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { autoAge.v.toArray should contain theSameElementsAs manualAge.v.toArray } @@ -101,7 +105,9 @@ class NumericVectorizerTest extends FlatSpec with FeatureTestBase { height, height.autoBucketize(labelData, trackNulls = false) ).transmogrify() val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) - + val field = vectorized.schema(autoBucketFeature.name) + assertNominal(field, Array(false, true, false) ++ + Array.fill(vectorized.collect(autoBucketFeature).head.value.size - 3)(true)) for {(autoAge, manualAge) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { autoAge.v.toArray should contain theSameElementsAs manualAge.v.toArray } @@ -118,6 +124,12 @@ class NumericVectorizerTest extends FlatSpec with FeatureTestBase { val autoBucketFeature = Seq(count).transmogrify(label = Some(labelData)) val manualBucketFeature = Seq(count, count.autoBucketize(labelData, trackNulls = false)).transmogrify() val vectorized = new OpWorkflow().setResultFeatures(autoBucketFeature, manualBucketFeature).transform(ds) + val field = vectorized.schema(autoBucketFeature.name) + assertNominal(field, false +: + Array.fill(vectorized.collect(autoBucketFeature).head.value.size - 1)(true)) + val field2 = vectorized.schema(manualBucketFeature.name) + assertNominal(field2, false +: + Array.fill(vectorized.collect(manualBucketFeature).head.value.size - 1)(true)) for {(autoAge, manualAge) <- vectorized.collect(autoBucketFeature, manualBucketFeature)} { autoAge.v.toArray should contain theSameElementsAs manualAge.v.toArray diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala index 5b90b286c5..53a9de18cc 100644 --- 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizerTest.scala @@ -41,7 +41,7 @@ import org.scalatest.junit.JUnitRunner import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { +class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val (catData, top, bot) = TestFeatureBuilder("top", "bot", Seq[(MultiPickList, MultiPickList)]( @@ -104,6 +104,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(catData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe false @@ -139,6 +141,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(catData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe false @@ -176,6 +180,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe false @@ -206,6 +212,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = 
vectorizer.transform(realMapData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe false @@ -231,6 +239,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(catData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe true @@ -264,6 +274,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe true @@ -294,6 +306,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(realMapData) val result = transformed.collect(vector) + // TODO : Find a way to recognize hashed RealMap has Categoricals vectorizer.isSharedHashSpace shouldBe true @@ -319,6 +332,7 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(realMapData) val result = transformed.collect(vector) + // TODO : Find a way to recognize hashed RealMap has Categoricals vectorizer.isSharedHashSpace shouldBe true @@ -341,6 +355,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) + val field = 
transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe false @@ -371,6 +387,8 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { val vector = vectorizer.getOutput() val transformed = vectorizer.transform(textListData) val result = transformed.collect(vector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) vectorizer.isSharedHashSpace shouldBe true @@ -398,6 +416,9 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { .setNumFeatures(10).setHashSpaceStrategy(HashSpaceStrategy.Separate) val feature = vectorizer.getOutput() val transformed = vectorizer.transform(catData) + val vector = vectorizer.getOutput() + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) val meta = OpVectorMetadata(transformed.schema(feature.name)) meta.history.keys shouldBe Set(top.name, bot.name) meta.columns.length shouldBe 20 @@ -412,6 +433,9 @@ class OPCollectionHashingVectorizerTest extends FlatSpec with TestSparkContext { .setNumFeatures(10).setHashSpaceStrategy(HashSpaceStrategy.Shared) val feature = vectorizer.getOutput() val transformed = vectorizer.transform(catData) + val vector = vectorizer.getOutput() + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(false)) val meta = OpVectorMetadata(transformed.schema(feature.name)) meta.history.keys shouldBe Set(top.name, bot.name) meta.columns.length shouldBe 10 diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala index eee1dd3521..13a9bffe4d 100644 --- 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OPMapVectorizerTest.scala @@ -52,7 +52,7 @@ import scala.reflect.runtime.universe._ @RunWith(classOf[JUnitRunner]) -class OPMapVectorizerTest extends FlatSpec with TestSparkContext { +class OPMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { import OPMapVectorizerTestHelper._ @@ -83,7 +83,10 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(14, Array(0, 2, 11), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) - transformed.collect(vectorizer.getOutput()) shouldBe expected + val output = vectorizer.getOutput() + val field = transformed.schema(output.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) + transformed.collect(output) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -122,7 +125,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val currencyData3: Seq[Currency] = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0) .withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[Currency, CurrencyMap, Double](currencyData, currencyData2, currencyData3) + testFeatureToMap[Currency, CurrencyMap, Double](currencyData, currencyData2, currencyData3, isCategorical = false) } "Date features" should "be vectorized the same whether they're in maps or not" in { @@ -135,7 +138,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val dateData3: Seq[Date] = RandomIntegral.dates(init = new JDate(1500000000000L), minStep = minSec, maxStep = maxSec).withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[Date, DateMap, Long](dateData, dateData2, dateData3) + testFeatureToMap[Date, DateMap, Long](dateData, dateData2, dateData3, false) } "DateTime features" should "be vectorized the same whether they're in maps or not" in { @@ -148,7 +151,7 @@ class 
OPMapVectorizerTest extends FlatSpec with TestSparkContext { val dateTimeData3: Seq[DateTime] = RandomIntegral.datetimes(init = new JDate(), minStep = minSec, maxStep = maxSec) .withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[DateTime, DateTimeMap, Long](dateTimeData, dateTimeData2, dateTimeData3) + testFeatureToMap[DateTime, DateTimeMap, Long](dateTimeData, dateTimeData2, dateTimeData3, false) } "Email features" should "be vectorized the same whether they're in maps or not" in { @@ -178,7 +181,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val integralData3: Seq[Integral] = RandomIntegral.integrals(from = -100L, to = 100L) .withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[Integral, IntegralMap, Long](integralData, integralData2, integralData3) + testFeatureToMap[Integral, IntegralMap, Long](integralData, integralData2, integralData3, false) } "MultiPickList features" should "be vectorized the same whether they're in maps or not" in { @@ -197,7 +200,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val percentData2: Seq[Percent] = RandomReal.uniform[Percent]().withProbabilityOfEmpty(0.5).limit(1000) val percentData3: Seq[Percent] = RandomReal.uniform[Percent]().withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[Percent, PercentMap, Double](percentData, percentData2, percentData3) + testFeatureToMap[Percent, PercentMap, Double](percentData, percentData2, percentData3, false) } // TODO: Fix failing test @@ -228,7 +231,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val realData2: Seq[Real] = RandomReal.uniform[Real]().withProbabilityOfEmpty(0.5).limit(1000) val realData3: Seq[Real] = RandomReal.uniform[Real]().withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[Real, RealMap, Double](realData, realData2, realData3) + testFeatureToMap[Real, RealMap, Double](realData, realData2, realData3, false) } "TextArea features" should "be vectorized the same whether they're 
in maps or not" in { @@ -239,7 +242,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val textAreaData3: Seq[TextArea] = RandomText.textAreas(minLen = 5, maxLen = 10) .withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[TextArea, TextAreaMap, String](textAreaData, textAreaData2, textAreaData3) + testFeatureToMap[TextArea, TextAreaMap, String](textAreaData, textAreaData2, textAreaData3, false) } "Text features" should "be vectorized the same whether they're in maps or not" in { @@ -247,7 +250,7 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val textData2: Seq[Text] = RandomText.strings(minLen = 5, maxLen = 10).withProbabilityOfEmpty(0.5).limit(1000) val textData3: Seq[Text] = RandomText.strings(minLen = 5, maxLen = 10).withProbabilityOfEmpty(0.5).limit(1000) - testFeatureToMap[Text, TextMap, String](textData, textData2, textData3) + testFeatureToMap[Text, TextMap, String](textData, textData2, textData3, false) } "URL features" should "be vectorized the same whether they're in maps or not" in { @@ -317,11 +320,12 @@ class OPMapVectorizerTest extends FlatSpec with TestSparkContext { val GeolocationData2: Seq[Geolocation] = RandomList.ofGeolocations.limit(1000) val GeolocationData3: Seq[Geolocation] = RandomList.ofGeolocations.limit(1000) - testFeatureToMap[Geolocation, GeolocationMap, Seq[Double]](GeolocationData, GeolocationData2, GeolocationData3) + testFeatureToMap[Geolocation, GeolocationMap, Seq[Double]](GeolocationData, GeolocationData2, GeolocationData3, + false) } } -object OPMapVectorizerTestHelper extends Matchers { +object OPMapVectorizerTestHelper extends Matchers with AttributeAsserts { val log = LoggerFactory.getLogger(this.getClass) @@ -330,15 +334,17 @@ object OPMapVectorizerTestHelper extends Matchers { * corresponds to its own key in the OPMap feature. This is used to test whether base feature types are vectorized * the same as their corresponding map types. 
* - * @param f1Data Sequence of base feature type data (eg. from generators) - * @param f2Data Sequence of base feature type data (eg. from generators) - * @param f3Data Sequence of base feature type data (eg. from generators) + * @param f1Data Sequence of base feature type data (eg. from generators) + * @param f2Data Sequence of base feature type data (eg. from generators) + * @param f3Data Sequence of base feature type data (eg. from generators) + * @param isCategorical If the vector contains categoricals * @tparam F Base feature type (eg. ID, Text, Integer) * @tparam FM OPMap feature type (eg. IDMap, TextMap, IntegerMap) * @tparam MT Value type of map inside OPMap feature (eg. String, String, Int) */ def testFeatureToMap[F <: FeatureType : TypeTag, FM <: OPMap[MT] : TypeTag, MT: TypeTag] - (f1Data: Seq[F], f2Data: Seq[F], f3Data: Seq[F])(implicit spark: SparkSession): Unit = { + (f1Data: Seq[F], f2Data: Seq[F], f3Data: Seq[F], isCategorical: Boolean = true)(implicit spark: SparkSession): + Unit = { val generatedData: Seq[(F, F, F)] = f1Data.zip(f2Data).zip(f3Data).map { case ((f1, f2), f3) => (f1, f2, f3) } val (rawDF, rawF1, rawF2, rawF3) = TestFeatureBuilder("f1", "f2", "f3", generatedData) @@ -359,6 +365,21 @@ object OPMapVectorizerTestHelper extends Matchers { log.info("transformed data:") transformed.show(10) } + val isCategoricalArray = if (isCategorical) { + Array.fill(transformed.collect(featureVector).head.value.size)(true) + } else { + rawF1 match { + case f if f.isSubtypeOf[Date] => Array.fill(24)(false) ++ Array.fill(3)(Seq(false, true)).flatten + .asInstanceOf[Array[Boolean]] + case f if f.isSubtypeOf[TextArea] || f.isSubtypeOf[Text] => Array.fill( + transformed.collect(featureVector).head.value.size - 3)(false) ++ Array.fill(3)(true) + case f if f.isSubtypeOf[Geolocation] => Array.fill(transformed.collect(featureVector).head.value.size / 4)( + Seq(false, false, false, true)).flatten + case _ => 
Array.fill(transformed.collect(featureVector).head.value.size / 2)(Seq(false, true)).flatten + } + } + val field = transformed.schema(featureVector.name) + assertNominal(field, isCategoricalArray) val summary = transformed.schema(featureVector.name).metadata log.info("summary:\n{}", summary) @@ -374,6 +395,9 @@ object OPMapVectorizerTestHelper extends Matchers { log.info("transformedMap:") transformedMap.show(10) } + val fieldMap = transformedMap.schema(mapFeatureVector.name) + assertNominal(fieldMap, isCategoricalArray) + // Check that the actual features are the same val vectorizedBaseFeatures = transformed.collect(featureVector) @@ -459,6 +483,7 @@ object OPMapVectorizerTestHelper extends Matchers { case _ => Map.empty } } + val mapData = asMap(f1, "f1") ++ asMap(f2, "f2") ++ asMap(f3, "f3") ftFactory.newInstance(mapData) } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OpSetVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OpSetVectorizerTest.scala index 197ee442d5..2d6eb82811 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/OpSetVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/OpSetVectorizerTest.scala @@ -47,7 +47,7 @@ import org.slf4j.LoggerFactory @RunWith(classOf[JUnitRunner]) -class OpSetVectorizerTest extends FlatSpec with TestSparkContext { +class OpSetVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val log = LoggerFactory.getLogger(this.getClass) @@ -115,6 +115,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(10, Array(1, 5, 6), Array(1.0, 1.0, 1.0)), Vectors.sparse(10, Array(0, 1, 7), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -129,6 +131,8 @@ 
class OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(13, Array(4, 7, 8), Array(1.0, 1.0, 1.0)), Vectors.sparse(13, Array(1, 2, 9), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = fitted.getMetadata() val expectedMeta = TestOpVectorMetadataBuilder( @@ -150,6 +154,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val fitted = vectorizer.setCleanText(true).setTopK(1).fit(dataSet) val transformed = fitted.transform(dataSet) val vector = vectorizer.getOutput() + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) transformed.collect(vector) shouldBe expectedData vectorizer.setTopK(10) } @@ -164,6 +170,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 1.0, 0.0, 2.0, 0.0), Vectors.dense(1.0, 1.0, 0.0, 1.0, 0.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) } it should "return a vector with elements only in the other & null columns and not throw errors when passed data" + @@ -176,6 +184,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 1.0, 0.0, 1.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = fitted.getMetadata() val expectedMeta = TestOpVectorMetadataBuilder( @@ -208,6 +218,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(1.0, 0.0, 0.0, 0.0), Vectors.dense(0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + 
assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = fitted.getMetadata() val expectedMeta = TestOpVectorMetadataBuilder( @@ -231,6 +243,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val vector = fitted.getOutput() val transformed = fitted.transform(dataSetAllEmpty) val expected = Array(Vectors.dense(0.0, 1.0), Vectors.dense(0.0, 1.0), Vectors.dense(0.0, 1.0)).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = fitted.getMetadata() val expectedMeta = TestOpVectorMetadataBuilder( @@ -247,7 +261,10 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { .asInstanceOf[Transformer].transform(dataSet) result.originStage shouldBe a[OpSetVectorizer[_]] - df.collect(result) shouldBe expectedData + val actual = df.collect(result) + actual shouldBe expectedData + val field = df.schema(result.name) + assertNominal(field, Array.fill(actual.head.value.size)(true)) } it should "expand number of columns for picklist features by two (one for other & one for null)" in { @@ -285,6 +302,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val fitted = localVectorizer.fit(localDataSet) val transformed = fitted.transform(localDataSet) val vector = localVectorizer.getOutput() + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(transformed.collect(vector).head.value.size)(true)) } it should "process multiple columns of PickList using the vectorize shortcut" in { @@ -304,6 +323,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val inputDF = TestOpWorkflowBuilder(localDF, vectorized).computeDataUpTo(vectorized) val transformed = new OpWorkflow().setResultFeatures(vectorized).transform(localDF) + val field = transformed.schema(vectorized.name) + 
assertNominal(field, Array.fill(transformed.collect(vectorized).head.value.size)(true)) val metaMap = transformed.metadata(vectorized) log.info(metaMap.toString) @@ -326,6 +347,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val inputDF = TestOpWorkflowBuilder(localDF, vectorized).computeDataUpTo(vectorized) val transformed = new OpWorkflow().setResultFeatures(vectorized).transform(localDF) + val field = transformed.schema(vectorized.name) + assertNominal(field, Array.fill(transformed.collect(vectorized).head.value.size)(true)) val metaMap = transformed.metadata(vectorized) log.info(metaMap.toString) @@ -349,6 +372,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val res = f2.transformWith[OPVector](stage = oPSetVectorizer.setTopK(3), Array.empty[FeatureLike[MultiPickList]]) val transformed = new OpWorkflow().setResultFeatures(res).transform(localDF) + val field = transformed.schema(res.name) + assertNominal(field, Array.fill(transformed.collect(res).head.value.size)(true)) } it should "process multiple columns of numerics, PickLists, and MultiPickLists using the vectorize shortcut" in { @@ -369,6 +394,8 @@ class OpSetVectorizerTest extends FlatSpec with TestSparkContext { val inputDF = TestOpWorkflowBuilder(localDF, vectorized).computeDataUpTo(vectorized) val transformed = new OpWorkflow().setResultFeatures(vectorized).transform(localDF) + val field = transformed.schema(vectorized.name) + assertNominal(field, Array(true, true, true, true, false, true)) val metaMap = transformed.metadata(vectorized) log.info(metaMap.toString) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala index f334370509..ff46122226 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala +++ 
b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealMapVectorizerTest.scala @@ -44,7 +44,8 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class RealMapVectorizerTest - extends OpEstimatorSpec[OPVector, SequenceModel[RealMap, OPVector], RealMapVectorizer[RealMap]] { + extends OpEstimatorSpec[OPVector, SequenceModel[RealMap, OPVector], RealMapVectorizer[RealMap]] + with AttributeAsserts { val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2", Seq( @@ -92,7 +93,8 @@ class RealMapVectorizerTest val vectorizer = estimator.setDefaultValue(0.0).setTrackNulls(false).fit(inputData) val transformed = vectorizer.transform(inputData) val vector = vectorizer.getOutput() - + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expectedResult transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -108,7 +110,8 @@ class RealMapVectorizerTest Vectors.sparse(12, Array(1, 3, 4, 8, 11), Array(1.0, 1.0, 11.0, 3.0, 1.0)), Vectors.sparse(12, Array(1, 3, 5, 7, 9, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) - + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls val vectorMetadata = vectorizer.getMetadata() @@ -124,7 +127,8 @@ class RealMapVectorizerTest Vectors.dense(Array(100.0, 100.0, 11.0, 0.0, 3.0, 100.0)), Vectors.dense(Array(100.0, 100.0, 100.0, 100.0, 100.0, 100.0)) ).map(_.toOPVector) - + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expected 
transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -141,7 +145,8 @@ class RealMapVectorizerTest Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 11.0, 0.0, 0.0, 0.0, 3.0, 0.0, 100.0, 1.0)), Vectors.dense(Array(100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0, 100.0, 1.0)) ).map(_.toOPVector) - + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMetaTrackNulls val vectorMetadata = vectorizer.getMetadata() @@ -162,7 +167,8 @@ class RealMapVectorizerTest vectorizer, m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B")), m2 -> List(IndColWithGroup(None, "Z"))) - + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(false)) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -186,7 +192,8 @@ class RealMapVectorizerTest IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B")), m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(nullIndicatorValue, "Z")) ) - + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -208,8 +215,9 @@ class RealMapVectorizerTest m1 -> List(IndColWithGroup(None, "B"), IndColWithGroup(None, "C")), m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "Y")) ) - - transformed.collect(vector) shouldBe expected + val field = 
transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(false)) + transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual expectedMeta @@ -233,7 +240,8 @@ class RealMapVectorizerTest m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X"), IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y")) ) - + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected transformed.schema.toOpVectorMetadata(vectorizer.getOutputFeatureName) shouldEqual expectedMeta val vectorMetadata = vectorizer.getMetadata() @@ -251,7 +259,8 @@ class RealMapVectorizerTest Vectors.dense(-1.0, 4.0, 11.0, 0.0, 1.0, 5.0), Vectors.dense(-1.0, 4.0, 11.0, 0.0, 8.0 / 3, 15.0 / 2) ).map(_.toOPVector) - + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expectedResult.head.value.size)(false)) transformed.collect(vector) shouldBe expected val expectedMeta = TestOpVectorMetadataBuilder( @@ -275,7 +284,8 @@ class RealMapVectorizerTest Vectors.dense(-1.0, 1.0, 4.0, 1.0, 11.0, 1.0, 0.0, 0.0, 1.0, 0.0, 5.0, 0.0), Vectors.dense(-1.0, 1.0, 4.0, 1.0, 11.0, 1.0, 0.0, 1.0, 8.0 / 3, 1.0, 15.0 / 2, 1.0) ).map(_.toOPVector) - + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size / 2)(Seq(false, true)).flatten) transformed.collect(vector) shouldBe expected val expectedMetaTrackNulls = TestOpVectorMetadataBuilder( diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealVectorizerTest.scala index 02a2b123aa..850cfa9653 100644 --- 
a/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/RealVectorizerTest.scala @@ -43,7 +43,7 @@ import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class RealVectorizerTest extends FlatSpec with TestSparkContext { +class RealVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val (testData, inA, inB, inC) = TestFeatureBuilder("inA", "inB", "inC", Seq[(Real, Real, Real)]( @@ -106,7 +106,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, null, Vectors.dense(4.2, 2.0, 4.2)), (2.0, null, null, Vectors.dense(2.0, 4.2, 4.2)) ) - + val field = testDataTransformedConstant.schema(testModelConstant.getOutputFeatureName) + assertNominal(field, Array.fill(expectedZero.head._4.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedZero.map(_._1) transformedValuesConstant.map(_.get(1)) shouldEqual expectedZero.map(_._2) transformedValuesConstant.map(_.get(2)) shouldEqual expectedZero.map(_._3) @@ -132,7 +133,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, null, Vectors.dense(3.0, 2.0, 0.0)), (2.0, null, null, Vectors.dense(2.0, 2.0, 0.0)) ) - + val field = testDataTransformedMean.schema(testModelMean.getOutputFeatureName) + assertNominal(field, Array.fill(expectedMean.head._4.size)(false)) transformedValuesMean.map(_.get(0)) shouldEqual expectedMean.map(_._1) transformedValuesMean.map(_.get(1)) shouldEqual expectedMean.map(_._2) transformedValuesMean.map(_.get(2)) shouldEqual expectedMean.map(_._3) @@ -154,7 +156,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, null, Vectors.dense(0.0, 1.0, 2.0, 0.0, 0.0, 1.0)), (2.0, null, null, Vectors.dense(2.0, 0.0, 0.0, 1.0, 0.0, 1.0)) ) - + val field = testDataTransformedConstantTracked.schema(testModelConstantTracked.getOutputFeatureName) + assertNominal(field, 
Array.fill(expectedZeroTracked.head._4.size / 2)(Seq(false, true)).flatten) transformedValuesZeroTracked.map(_.get(0)) shouldEqual expectedZeroTracked.map(_._1) transformedValuesZeroTracked.map(_.get(1)) shouldEqual expectedZeroTracked.map(_._2) transformedValuesZeroTracked.map(_.get(2)) shouldEqual expectedZeroTracked.map(_._3) @@ -187,7 +190,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, null, Vectors.dense(3.0, 1.0, 2.0, 0.0, 0.0, 1.0)), (2.0, null, null, Vectors.dense(2.0, 0.0, 2.0, 1.0, 0.0, 1.0)) ) - + val field = testDataTransformedMeanTracked.schema(testModelMeanTracked.getOutputFeatureName) + assertNominal(field, Array.fill(expectedMeanTracked.head._4.size / 2)(Seq(false, true)).flatten) transformedValuesMeanTracked.map(_.get(0)) shouldEqual expectedMeanTracked.map(_._1) transformedValuesMeanTracked.map(_.get(1)) shouldEqual expectedMeanTracked.map(_._2) transformedValuesMeanTracked.map(_.get(2)) shouldEqual expectedMeanTracked.map(_._3) @@ -220,7 +224,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, null, Vectors.dense(3.0, 1.0, 2.0, 0.0, 0.0, 1.0)), (2.0, null, null, Vectors.dense(2.0, 0.0, 2.0, 1.0, 0.0, 1.0)) ) - + val field = testDataTransformedMeanTracked.schema(testModelMeanTracked.getOutputFeatureName) + assertNominal(field, Array.fill(expectedMeanTracked.head._4.size / 2)(Seq(false, true)).flatten) transformedValuesMeanTracked.map(_.get(0)) shouldEqual expectedMeanTracked.map(_._1) transformedValuesMeanTracked.map(_.get(1)) shouldEqual expectedMeanTracked.map(_._2) transformedValuesMeanTracked.map(_.get(2)) shouldEqual expectedMeanTracked.map(_._3) @@ -252,7 +257,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, null, Vectors.dense(4.2, 2.0, 4.2)), (2.0, null, null, Vectors.dense(2.0, 4.2, 4.2)) ) - + val field = testDataTransformedConstant.schema(testModelConstant.getOutputFeatureName) + assertNominal(field, 
Array.fill(expectedZero.head._4.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedZero.map(_._1) transformedValuesConstant.map(_.get(1)) shouldEqual expectedZero.map(_._2) transformedValuesConstant.map(_.get(2)) shouldEqual expectedZero.map(_._3) @@ -273,7 +279,8 @@ class RealVectorizerTest extends FlatSpec with TestSparkContext { (null, 2.0, null, Vectors.dense(4.2, 2.0, 4.2)), (2.0, null, null, Vectors.dense(2.0, 4.2, 4.2)) ) - + val field = testDataTransformedConstant.schema(testModelConstant.getOutputFeatureName) + assertNominal(field, Array.fill(expectedZero.head._4.size)(false)) transformedValuesConstant.map(_.get(0)) shouldEqual expectedZero.map(_._1) transformedValuesConstant.map(_.get(1)) shouldEqual expectedZero.map(_._2) transformedValuesConstant.map(_.get(2)) shouldEqual expectedZero.map(_._3) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index 1b48336def..618c1d9921 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -42,7 +42,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { +class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { lazy val (data, m1, m2, f1, f2) = TestFeatureBuilder("textMap1", "textMap2", "text1", "text2", Seq[(TextMap, TextMap, Text, Text)]( (TextMap(Map("text1" -> "hello world", "text2" -> "Hello world!")), TextMap.empty, @@ -105,13 +105,16 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, 
smartVectorized) - + val field = transformed.schema(smartVectorized.name) + assertNominal(field, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) + val fieldMap = transformed.schema(smartMapVectorized.name) + assertNominal(fieldMap, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) mapMeta.columns.length shouldBe meta.columns.length - mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => + mapMeta.columns.zip(meta.columns).foreach { case (m, f) => m.parentFeatureName shouldBe Array(m1.name) m.parentFeatureType shouldBe Array(m1.typeName) if (m.index < 4) m.grouping shouldBe f.grouping @@ -119,7 +122,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { m.indicatorValue shouldBe f.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } it should "detect two categorical text features" in { @@ -134,20 +137,24 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) - + val field = transformed.schema(smartVectorized.name) + assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + val fieldMap = transformed.schema(smartMapVectorized.name) + assertNominal(fieldMap, + Array.fill(transformed.collect(smartMapVectorized).head.value.size)(true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) mapMeta.columns.length shouldBe meta.columns.length - mapMeta.columns.zip(meta.columns).foreach{ 
case (m, f) => + mapMeta.columns.zip(meta.columns).foreach { case (m, f) => m.parentFeatureName shouldBe Array(m1.name) m.parentFeatureType shouldBe Array(m1.typeName) m.grouping shouldBe f.grouping m.indicatorValue shouldBe f.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } it should "use separate hash space for each text feature" in { @@ -164,13 +171,16 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) - + val field = transformed.schema(smartVectorized.name) + assertNominal(field, Array.fill(8)(false) ++ Array(true, true)) + val fieldMap = transformed.schema(smartMapVectorized.name) + assertNominal(fieldMap, Array.fill(8)(false) ++ Array(true, true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) mapMeta.columns.length shouldBe meta.columns.length - mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => + mapMeta.columns.zip(meta.columns).foreach { case (m, f) => m.parentFeatureName shouldBe Array(m1.name) m.parentFeatureType shouldBe Array(m1.typeName) if (m.index < 4 || m.index == 8) m.grouping shouldBe Option(f1.name) @@ -178,7 +188,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { m.indicatorValue shouldBe f.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } it should "use shared hash space for two text features" in { @@ -195,13 +205,16 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, 
smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) - + val field = transformed.schema(smartVectorized.name) + assertNominal(field, Array.fill(4)(false) ++ Array(true, true)) + val fieldMap = transformed.schema(smartMapVectorized.name) + assertNominal(fieldMap, Array.fill(4)(false) ++ Array(true, true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, m2.name) mapMeta.columns.length shouldBe meta.columns.length - mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => + mapMeta.columns.zip(meta.columns).foreach { case (m, f) => m.parentFeatureName shouldBe Array(m1.name) m.parentFeatureType shouldBe Array(m1.typeName) if (m.index == 4) { @@ -212,7 +225,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { m.indicatorValue shouldBe f.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } it should "use shared hash space for two text features again" in { @@ -229,13 +242,18 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, smartVectorized).transform(data) val result = transformed.collect(smartMapVectorized, smartVectorized) - + val field = transformed.schema(smartVectorized.name) + assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size - 2)(false) ++ + Array(true, true)) + val fieldMap = transformed.schema(smartMapVectorized.name) + assertNominal(fieldMap, Array.fill(transformed.collect(smartVectorized).head.value.size - 2)(false) ++ + Array(true, true)) val mapMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) mapMeta.history.keys shouldBe Set(m1.name, 
m2.name) mapMeta.columns.length shouldBe meta.columns.length - mapMeta.columns.zip(meta.columns).foreach{ case (m, f) => + mapMeta.columns.zip(meta.columns).foreach { case (m, f) => m.parentFeatureName shouldBe Array(m1.name) m.parentFeatureType shouldBe Array(m1.typeName) if (m.index == TransmogrifierDefaults.MaxNumOfFeatures) { @@ -246,7 +264,7 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { m.indicatorValue shouldBe f.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } it should "product the same result for shortcut" in { @@ -264,20 +282,23 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(smartMapVectorized, shortcutMapVectorized).transform(data) val result = transformed.collect(smartMapVectorized, shortcutMapVectorized) - + val field = transformed.schema(shortcutMapVectorized.name) + assertNominal(field, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) + val fieldMap = transformed.schema(smartMapVectorized.name) + assertNominal(fieldMap, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) val smartMeta = OpVectorMetadata(transformed.schema(smartMapVectorized.name)) val shortcutMeta = OpVectorMetadata(transformed.schema(shortcutMapVectorized.name)) smartMeta.history.keys shouldBe shortcutMeta.history.keys smartMeta.columns.length shouldBe shortcutMeta.columns.length - smartMeta.columns.zip(shortcutMeta.columns).foreach{ case (smart, shortcut) => + smartMeta.columns.zip(shortcutMeta.columns).foreach { case (smart, shortcut) => smart.parentFeatureName shouldBe shortcut.parentFeatureName smart.parentFeatureType shouldBe shortcut.parentFeatureType smart.grouping shouldBe shortcut.grouping smart.indicatorValue shouldBe shortcut.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } it 
should "work on textarea map fields" in { @@ -293,13 +314,16 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(textMapVectorized, textAreaMapVectorized).transform(data) val result = transformed.collect(textMapVectorized, textAreaMapVectorized) - + val field = transformed.schema(textMapVectorized.name) + assertNominal(field, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) + val fieldMap = transformed.schema(textAreaMapVectorized.name) + assertNominal(fieldMap, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) val textMapMeta = OpVectorMetadata(transformed.schema(textMapVectorized.name)) val textareaMapMeta = OpVectorMetadata(transformed.schema(textAreaMapVectorized.name)) textMapMeta.history.keys shouldBe textareaMapMeta.history.keys textMapMeta.columns.length shouldBe textareaMapMeta.columns.length - textMapMeta.columns.zip(textareaMapMeta.columns).foreach{ case (textMap, textareaMap) => + textMapMeta.columns.zip(textareaMapMeta.columns).foreach { case (textMap, textareaMap) => textMap.parentFeatureName shouldBe textareaMap.parentFeatureName textMap.parentFeatureType shouldBe Array("com.salesforce.op.features.types.TextMap") textareaMap.parentFeatureType shouldBe Array("com.salesforce.op.features.types.TextAreaMap") @@ -307,6 +331,6 @@ class SmartTextMapVectorizerTest extends FlatSpec with TestSparkContext { textMap.indicatorValue shouldBe textareaMap.indicatorValue } - result.foreach{ case (vec1, vec2) => vec1 shouldBe vec2} + result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala index 34ef01af0b..dd974ff446 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala +++ 
b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala @@ -43,7 +43,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class SmartTextVectorizerTest - extends OpEstimatorSpec[OPVector, SequenceModel[Text, OPVector], SmartTextVectorizer[Text]] { + extends OpEstimatorSpec[OPVector, SequenceModel[Text, OPVector], SmartTextVectorizer[Text]] with AttributeAsserts { lazy val (inputData, f1, f2) = TestFeatureBuilder("text1", "text2", Seq[(Text, Text)]( @@ -82,7 +82,14 @@ class SmartTextVectorizerTest val transformed = new OpWorkflow() .setResultFeatures(smartVectorized, categoricalVectorized, textVectorized, nullIndicator).transform(inputData) val result = transformed.collect(smartVectorized, categoricalVectorized, textVectorized, nullIndicator) - + val field = transformed.schema(smartVectorized.name) + assertNominal(field, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) + val fieldCategorical = transformed.schema(categoricalVectorized.name) + assertNominal(fieldCategorical, + Array.fill(transformed.collect(categoricalVectorized).head.value.size)(true)) + val fieldText = transformed.schema(textVectorized.name) + assertNominal(fieldText, + Array.fill(transformed.collect(textVectorized).head.value.size)(false)) val (smart, expected) = result.map { case (smartVector, categoricalVector, textVector, nullVector) => val combined = VectorsCombiner.combineOP(Seq(categoricalVector, textVector, nullVector)) smartVector -> combined @@ -101,7 +108,11 @@ class SmartTextVectorizerTest val transformed = new OpWorkflow().setResultFeatures(smartVectorized, categoricalVectorized).transform(inputData) val result = transformed.collect(smartVectorized, categoricalVectorized) - + val field = transformed.schema(smartVectorized.name) + assertNominal(field, Array.fill(transformed.collect(smartVectorized).head.value.size)(true)) + val fieldCategorical = transformed.schema(categoricalVectorized.name) + assertNominal(fieldCategorical, + 
Array.fill(transformed.collect(categoricalVectorized).head.value.size)(true)) val (smart, expected) = result.unzip smart shouldBe expected @@ -121,7 +132,10 @@ class SmartTextVectorizerTest val transformed = new OpWorkflow() .setResultFeatures(smartVectorized, textVectorized, nullIndicator).transform(inputData) val result = transformed.collect(smartVectorized, textVectorized, nullIndicator) - + val field = transformed.schema(smartVectorized.name) + assertNominal(field, Array.fill(8)(false) ++ Array(true, true)) + val fieldText = transformed.schema(textVectorized.name) + assertNominal(fieldText, Array.fill(transformed.collect(textVectorized).head.value.size)(false)) val (smart, expected) = result.map { case (smartVector, textVector, nullVector) => val combined = VectorsCombiner.combineOP(Seq(textVector, nullVector)) smartVector -> combined @@ -144,7 +158,10 @@ class SmartTextVectorizerTest val transformed = new OpWorkflow().setResultFeatures(smartVectorized, shortcutVectorized).transform(inputData) val result = transformed.collect(smartVectorized, shortcutVectorized) - + val field = transformed.schema(smartVectorized.name) + assertNominal(field, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) + val fieldShortcut = transformed.schema(shortcutVectorized.name) + assertNominal(fieldShortcut, Array.fill(4)(true) ++ Array.fill(4)(false) :+ true) val (regular, shortcut) = result.unzip regular shouldBe shortcut diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextListNullTransformerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextListNullTransformerTest.scala index 0c69e74888..4373e04bf9 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextListNullTransformerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextListNullTransformerTest.scala @@ -42,7 +42,7 @@ import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class TextListNullTransformerTest 
extends FlatSpec with TestSparkContext { +class TextListNullTransformerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val (ds, f1, f2) = TestFeatureBuilder( Seq[(TextList, TextList)]( @@ -83,6 +83,8 @@ class TextListNullTransformerTest extends FlatSpec with TestSparkContext { Array(0.0, 1.0), Array(1.0, 1.0) ).map(Vectors.dense(_).toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected val vectorMetadata = vectorizer.getMetadata() diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala index 913df46e9d..d7f81c2a46 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapNullEstimatorTest.scala @@ -42,7 +42,7 @@ import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) -class TextMapNullEstimatorTest extends FlatSpec with TestSparkContext { +class TextMapNullEstimatorTest extends FlatSpec with TestSparkContext with AttributeAsserts { val (ds, f1) = TestFeatureBuilder( Seq[(TextMap)]( @@ -74,7 +74,8 @@ class TextMapNullEstimatorTest extends FlatSpec with TestSparkContext { Array(1.0, 1.0, 0.0, 1.0) ).map(Vectors.dense(_).toOPVector) transformed.collect(vector) shouldBe expected - + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) val vectorMetadata = vectorizer.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder( vectorizer, diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapVectorizerTest.scala index 
4137a2c0cd..6fd001eb8b 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextMapVectorizerTest.scala @@ -44,7 +44,7 @@ import org.slf4j.LoggerFactory @RunWith(classOf[JUnitRunner]) -class TextMapVectorizerTest extends FlatSpec with TestSparkContext { +class TextMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { val log = LoggerFactory.getLogger(classOf[TextMapVectorizerTest]) @@ -92,17 +92,17 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { val vectorMetadata = fitted.getMetadata() OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, - top -> List( - IndColWithGroup(Some("D"), "C"), IndColWithGroup(Some("OTHER"), "C"), IndColWithGroup(Some("D"), "A"), - IndColWithGroup(Some("E"), "A"), IndColWithGroup(Some("OTHER"), "A"), - IndColWithGroup(Some("D"), "B"), IndColWithGroup(Some("OTHER"), "B") - ), - bot -> List( - IndColWithGroup(Some("W"), "X"), IndColWithGroup(Some("OTHER"), "X"), IndColWithGroup(Some("V"), "Y"), - IndColWithGroup(Some("OTHER"), "Y"), IndColWithGroup(Some("V"), "Z"), - IndColWithGroup(Some("W"), "Z"), IndColWithGroup(Some("OTHER"), "Z") + top -> List( + IndColWithGroup(Some("D"), "C"), IndColWithGroup(Some("OTHER"), "C"), IndColWithGroup(Some("D"), "A"), + IndColWithGroup(Some("E"), "A"), IndColWithGroup(Some("OTHER"), "A"), + IndColWithGroup(Some("D"), "B"), IndColWithGroup(Some("OTHER"), "B") + ), + bot -> List( + IndColWithGroup(Some("W"), "X"), IndColWithGroup(Some("OTHER"), "X"), IndColWithGroup(Some("V"), "Y"), + IndColWithGroup(Some("OTHER"), "Y"), IndColWithGroup(Some("V"), "Z"), + IndColWithGroup(Some("W"), "Z"), IndColWithGroup(Some("OTHER"), "Z") + ) ) - ) fitted.getInputFeatures() shouldBe Array(top, bot) fitted.parent shouldBe vectorizer } @@ -118,7 +118,8 @@ class TextMapVectorizerTest extends FlatSpec with 
TestSparkContext { Vectors.sparse(14, Array(0, 7, 9), Array(1.0, 1.0, 1.0)), Vectors.sparse(14, Array(0, 2, 11), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) - + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -134,7 +135,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(20, Array(0, 6, 9, 10, 13, 19), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(20, Array(0, 3, 9, 12, 15, 16), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) - + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected fitted.getMetadata() shouldBe transformed.schema.fields(2).metadata } @@ -150,21 +152,23 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(17, Array(0, 9, 11), Array(1.0, 1.0, 1.0)), Vectors.sparse(17, Array(1, 3, 14), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, - top -> List( - IndColWithGroup(Some("D"), "c"), IndColWithGroup(Some("d"), "c"), IndColWithGroup(Some("OTHER"), "c"), - IndColWithGroup(Some("d"), "a"), IndColWithGroup(Some("e"), "a"), - IndColWithGroup(Some("OTHER"), "a"), IndColWithGroup(Some("d"), "b"), IndColWithGroup(Some("OTHER"), "b") - ), - bot -> List( - IndColWithGroup(Some("W"), "x"), IndColWithGroup(Some("w"), "x"), IndColWithGroup(Some("OTHER"), "x"), - IndColWithGroup(Some("V"), "y"), IndColWithGroup(Some("v"), "y"), - IndColWithGroup(Some("OTHER"), "y"), IndColWithGroup(Some("v"), "z"), IndColWithGroup(Some("w"), "z"), - 
IndColWithGroup(Some("OTHER"), "z") + top -> List( + IndColWithGroup(Some("D"), "c"), IndColWithGroup(Some("d"), "c"), IndColWithGroup(Some("OTHER"), "c"), + IndColWithGroup(Some("d"), "a"), IndColWithGroup(Some("e"), "a"), + IndColWithGroup(Some("OTHER"), "a"), IndColWithGroup(Some("d"), "b"), IndColWithGroup(Some("OTHER"), "b") + ), + bot -> List( + IndColWithGroup(Some("W"), "x"), IndColWithGroup(Some("w"), "x"), IndColWithGroup(Some("OTHER"), "x"), + IndColWithGroup(Some("V"), "y"), IndColWithGroup(Some("v"), "y"), + IndColWithGroup(Some("OTHER"), "y"), IndColWithGroup(Some("v"), "z"), IndColWithGroup(Some("w"), "z"), + IndColWithGroup(Some("OTHER"), "z") + ) ) - ) } it should "track nulls when clean text is set to false" in { @@ -178,23 +182,25 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(23, Array(0, 7, 10, 12, 15, 22), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(23, Array(1, 4, 10, 14, 18, 19), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected OpVectorMetadata(vectorizer.getOutputFeatureName, vectorMetadata) shouldEqual TestOpVectorMetadataBuilder(vectorizer, - top -> List( - IndColWithGroup(Some("D"), "c"), IndColWithGroup(Some("d"), "c"), IndColWithGroup(Some("OTHER"), "c"), - IndColWithGroup(nullIndicatorValue, "c"), IndColWithGroup(Some("d"), "a"), IndColWithGroup(Some("e"), "a"), - IndColWithGroup(Some("OTHER"), "a"), IndColWithGroup(nullIndicatorValue, "a"), - IndColWithGroup(Some("d"), "b"), IndColWithGroup(Some("OTHER"), "b"), IndColWithGroup(nullIndicatorValue, "b") - ), - bot -> List( - IndColWithGroup(Some("W"), "x"), IndColWithGroup(Some("w"), "x"), IndColWithGroup(Some("OTHER"), "x"), - IndColWithGroup(nullIndicatorValue, "x"), IndColWithGroup(Some("V"), "y"), IndColWithGroup(Some("v"), "y"), - IndColWithGroup(Some("OTHER"), "y"), 
IndColWithGroup(nullIndicatorValue, "y"), - IndColWithGroup(Some("v"), "z"), IndColWithGroup(Some("w"), "z"), - IndColWithGroup(Some("OTHER"), "z"), IndColWithGroup(nullIndicatorValue, "z") + top -> List( + IndColWithGroup(Some("D"), "c"), IndColWithGroup(Some("d"), "c"), IndColWithGroup(Some("OTHER"), "c"), + IndColWithGroup(nullIndicatorValue, "c"), IndColWithGroup(Some("d"), "a"), IndColWithGroup(Some("e"), "a"), + IndColWithGroup(Some("OTHER"), "a"), IndColWithGroup(nullIndicatorValue, "a"), + IndColWithGroup(Some("d"), "b"), IndColWithGroup(Some("OTHER"), "b"), IndColWithGroup(nullIndicatorValue, "b") + ), + bot -> List( + IndColWithGroup(Some("W"), "x"), IndColWithGroup(Some("w"), "x"), IndColWithGroup(Some("OTHER"), "x"), + IndColWithGroup(nullIndicatorValue, "x"), IndColWithGroup(Some("V"), "y"), IndColWithGroup(Some("v"), "y"), + IndColWithGroup(Some("OTHER"), "y"), IndColWithGroup(nullIndicatorValue, "y"), + IndColWithGroup(Some("v"), "z"), IndColWithGroup(Some("w"), "z"), + IndColWithGroup(Some("OTHER"), "z"), IndColWithGroup(nullIndicatorValue, "z") + ) ) - ) } it should "return only the specified number of elements when top K is set" in { @@ -208,6 +214,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(12, Array(0, 6, 8), Array(1.0, 1.0, 1.0)), Vectors.sparse(12, Array(0, 2, 10), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -222,6 +230,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(18, Array(0, 5, 8, 9, 12, 17), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(18, Array(0, 3, 8, 11, 14, 15), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ 
-234,6 +244,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(10, Array(0, 5, 7), Array(1.0, 1.0, 1.0)), Vectors.sparse(10, Array(0, 2, 9), Array(1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -246,6 +258,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(16, Array(0, 5, 7, 8, 11, 15), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), Vectors.sparse(16, Array(0, 3, 7, 10, 13, 14), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(vector) shouldBe expected } @@ -259,6 +273,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 1.0, 0.0, 0.0, 0.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected val transformed2 = fitted.transform(dataSet) @@ -268,6 +284,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) + val field2 = transformed2.schema(vector.name) + assertNominal(field2, Array.fill(expected.head.value.size)(true)) transformed2.collect(fitted.getOutput()) shouldBe expected2 } @@ -281,6 +299,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected val transformed2 = 
fitted.transform(dataSet) @@ -290,6 +310,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0), Vectors.dense(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0) ).map(_.toOPVector) + val field2 = transformed2.schema(vector.name) + assertNominal(field2, Array.fill(expected.head.value.size)(true)) transformed2.collect(fitted.getOutput()) shouldBe expected2 } @@ -302,6 +324,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.dense(Array.empty[Double]), Vectors.dense(Array.empty[Double]) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } @@ -316,6 +340,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(5, Array(3), Array(1.0)), Vectors.sparse(5, Array(0), Array(1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } @@ -330,6 +356,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(7, Array(3, 4), Array(1.0, 1.0)), Vectors.sparse(7, Array(0, 6), Array(1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } @@ -345,6 +373,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(9, Array(0, 7), Array(1.0, 1.0)), Vectors.sparse(9, Array(0, 4), Array(1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } @@ -360,6 +390,8 @@ class TextMapVectorizerTest extends FlatSpec with TestSparkContext { Vectors.sparse(13, 
Array(0, 5, 9, 10), Array(1.0, 1.0, 1.0, 1.0)), Vectors.sparse(13, Array(0, 5, 6, 12), Array(1.0, 1.0, 1.0, 1.0)) ).map(_.toOPVector) + val field = transformed.schema(vector.name) + assertNominal(field, Array.fill(expected.head.value.size)(true)) transformed.collect(fitted.getOutput()) shouldBe expected } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala index 78ca407f2d..5310d4c317 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextTransmogrifyTest.scala @@ -43,7 +43,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { +class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest with AttributeAsserts { val cityData: Seq[City] = RandomText.cities.take(10).toList val countryData: Seq[Country] = RandomText.countries.take(10).toList @@ -55,7 +55,7 @@ class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val data: Seq[(City, Country, PostalCode, Text, TextArea)] = cityData.zip(countryData).zip(postalData).zip(textData).zip(textAreaData) - .map{ case ((((ci, co), p), t), ta) => (ci, co, p, t, ta) } + .map { case ((((ci, co), p), t), ta) => (ci, co, p, t, ta) } val (ds, city, country, postal, text, textarea) = TestFeatureBuilder("city", "country", "postal", "text", "textarea", data) @@ -96,7 +96,9 @@ class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val feature = Seq(largeText, largeTextarea).transmogrify() val vectorized = new OpWorkflow().setResultFeatures(feature).transform(largeDS) val vectCollect = vectorized.collect(feature) - + val field = vectorized.schema(feature.name) + val array = Array.fill(vectCollect.head.value.size / 2 - 1)(false) :+ true + 
assertNominal(field, array ++ array) for {vector <- vectCollect} { vector.v.size shouldBe TransmogrifierDefaults.DefaultNumOfFeatures * 2 + 2 } @@ -109,7 +111,10 @@ class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val feature2 = phone.vectorize("US") val vectorized = new OpWorkflow().setResultFeatures(feature, feature2).transform(ds) val vectCollect = vectorized.collect(feature, feature2) - + val field = vectorized.schema(feature.name) + assertNominal(field, Array.fill(vectCollect.head._1.value.size)(true)) + val field2 = vectorized.schema(feature2.name) + assertNominal(field2, Array.fill(vectCollect.head._2.value.size)(true)) for {(vector1, vector2) <- vectCollect} { vector1.v.size shouldBe 2 vector1.v.toArray should contain theSameElementsAs vector2.v.toArray @@ -122,7 +127,8 @@ class TextTransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val feature = Seq(text).transmogrify() val vectorized = new OpWorkflow().setResultFeatures(feature).transform(ds) val vectCollect = vectorized.collect(feature) - + val field = vectorized.schema(feature.name) + assertNominal(field, Array.fill(vectCollect.head.value.size)(true)) vectCollect.forall(_.value.size == TransmogrifierDefaults.DefaultNumOfFeatures + 1) } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala index 500a7859a6..9c82508e7c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TextVectorizerTest.scala @@ -40,7 +40,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class TextVectorizerTest extends FlatSpec with TestSparkContext { +class TextVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts { // scalastyle:off lazy val (data, f1, f2) = TestFeatureBuilder( Seq[(Text, Text)]( @@ -63,7 +63,8 
@@ class TextVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(vectorized).transform(data) val result = transformed.collect(vectorized) val f1NameHash = hasher.indexOf(vectorized.parents.head.originStage.getInputFeatures().head.name) - + val field = transformed.schema(vectorized.name) + assertNominal(field, Array.fill(result.head.value.size - 1)(false) :+ true) // scalastyle:off result(0).value(hasher.indexOf(s"${f1NameHash}_" + "hamlet")) should be >= 1.0 result(0).value(hasher.indexOf(s"${f1NameHash}_" + "question")) should be >= 1.0 @@ -86,7 +87,8 @@ class TextVectorizerTest extends FlatSpec with TestSparkContext { val transformed = new OpWorkflow().setResultFeatures(vectorized).transform(data) val result = transformed.collect(vectorized) val f1NameHash = hasher.indexOf(vectorized.parents.head.originStage.getInputFeatures().head.name) - + val field = transformed.schema(vectorized.name) + assertNominal(field, Array.fill(result.head.value.size - 2)(false) ++ Array(true, true)) // scalastyle:off result(0).value(hasher.indexOf(s"${f1NameHash}_" + "hamlet")) shouldBe 1.0 result(0).value(hasher.indexOf(s"${f1NameHash}_" + "hamlet")) shouldBe 1.0 diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TransmogrifyTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TransmogrifyTest.scala index 202be7f5c7..eec59642d4 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/TransmogrifyTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/TransmogrifyTest.scala @@ -42,7 +42,7 @@ import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) -class TransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { +class TransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest with AttributeAsserts { val inputFeatures = Array[OPFeature](heightNoWindow, weight, gender) @@ -64,11 +64,10 @@ class 
TransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { val feature = inputFeatures.toSeq.transmogrify() val model = new OpWorkflow().setResultFeatures(feature).setReader(dataReader).train() val transformed = model.score(keepRawFeatures = true, keepIntermediateFeatures = true) - val hist = feature.parents.flatMap{ f => + val hist = feature.parents.flatMap { f => val h = f.history() h.originFeatures.map(o => o -> FeatureHistory(Seq(o), h.stages)) }.toMap - transformed.schema.toOpVectorMetadata(feature.name) shouldEqual TestOpVectorMetadataBuilder.withOpNamesAndHist( feature.originStage, @@ -92,6 +91,8 @@ class TransmogrifyTest extends FlatSpec with PassengerSparkFixtureTest { List(1.0, 0.0, 363.0, 0.0, 172.0, 0.0), List(1.0, 0.0, 186.0, 0.0, 96.0, 0.0) ) + val field = transformed.schema(feature.name) + assertNominal(field, Array(true, true, false, true, false, true)) } } diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala index 666929eb00..3fb4ef616b 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/URLVectorizerTest.scala @@ -45,7 +45,8 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class URLVectorizerTest - extends FlatSpec with FeatureTestBase with RichTextFeature with RichMapFeature with RichFeature { + extends FlatSpec with FeatureTestBase with RichTextFeature with RichMapFeature with RichFeature + with AttributeAsserts { val urlKey = "Url1" val urlKey2 = "Url2" val urls = (RandomText.urlsOn(_ => "salesforce.com").take(2) ++ RandomText.urlsOn(_ => "data.com").take(2)).toSeq @@ -79,8 +80,13 @@ class URLVectorizerTest Vectors.dense(1.0, 0.0, 0.0, 0.0) ).map(_.toOPVector) - def transformAndCollect(ds: DataFrame, feature: FeatureLike[OPVector]): Array[OPVector] = - new 
OpWorkflow().setResultFeatures(feature).transform(ds).collect(feature) + def transformAndCollect(ds: DataFrame, feature: FeatureLike[OPVector]): Array[OPVector] = { + val transformed = new OpWorkflow().setResultFeatures(feature).transform(ds) + val results = transformed.collect(feature) + val field = transformed.schema(feature.name) + assertNominal(field, Array.fill(results.head.value.size)(true)) + results + } Spec[RichURLMapFeature] should "vectorize UrlMaps correctly" in { val (ds1, f1) = TestFeatureBuilder(urls.map(e => Map(urlKey -> e.value.get).toURLMap)) diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala index 5c693f5e3a..cec8233bde 100644 --- a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala +++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala @@ -31,6 +31,8 @@ package com.salesforce.op.utils.spark import com.salesforce.op.FeatureHistory +import com.salesforce.op.features.types.{FeatureType, _} +import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NumericAttribute} import org.apache.spark.ml.linalg.SQLDataTypes._ import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField} @@ -38,8 +40,9 @@ import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField} * Represents a metadata wrapper that includes parent feature information. * * The metadata includes a columns field that describes the columns in the vector. 
- * @param name name of the feature vector - * @param col information about each element in the vector + * + * @param name name of the feature vector + * @param col information about each element in the vector * @param history history of parent features used to create the vector map is from * OpVectorColumnMetadata.parentFeatureName (String) to FeatureHistory */ @@ -53,7 +56,7 @@ class OpVectorMetadata private /** * Column metadata with indicies fixed to match order passed in */ - val columns: Array[OpVectorColumnMetadata] = col.zipWithIndex.map{ case(c, i) => c.copy(index = i) } + val columns: Array[OpVectorColumnMetadata] = col.zipWithIndex.map { case (c, i) => c.copy(index = i) } /** * Get the number of columns in vectors of this type @@ -72,6 +75,7 @@ class OpVectorMetadata private newColumns: Array[OpVectorColumnMetadata] ): OpVectorMetadata = OpVectorMetadata(name, newColumns, history) + val categoricalTypes = Seq(FeatureType.typeName[Binary], FeatureType.typeName[BinaryMap]) /** * Serialize to spark metadata @@ -82,12 +86,18 @@ class OpVectorMetadata private val groupedCol = columns .groupBy(c => (c.parentFeatureName, c.parentFeatureType, c.grouping, c.indicatorValue, c.descriptorValue)) val colData = groupedCol.toSeq - .map{ case (_, g) => g.head -> g.map(_.index) } - val colMeta = colData.map{ case (c, i) => c.toMetadata(i) } - new MetadataBuilder() + .map { case (_, g) => g.head -> g.map(_.index) } + val colMeta = colData.map { case (c, i) => c.toMetadata(i) } + val meta = new MetadataBuilder() .putMetadataArray(OpVectorMetadata.ColumnsKey, colMeta.toArray) .putMetadata(OpVectorMetadata.HistoryKey, FeatureHistory.toMetadata(history)) .build() + val attributes = columns.map { + case c if c.indicatorValue.isDefined || categoricalTypes.exists(c.parentFeatureType.contains) => + BinaryAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index) + case c => NumericAttribute.defaultAttr.withName(c.makeColName()).withIndex(c.index) + } + new 
AttributeGroup(name, attributes).toMetadata(meta) } /** @@ -102,10 +112,11 @@ class OpVectorMetadata private /** * Extract the full history for each element of the vector + * * @return Sequence of [[OpVectorColumnHistory]] */ def getColumnHistory(): Seq[OpVectorColumnHistory] = { - columns.map{ c => + columns.map { c => val hist = c.parentFeatureName.map(pn => history.getOrElse(pn, throw new RuntimeException(s"Parent feature name '${pn}' has no associated history"))) val histComb = hist.head.merge(hist.tail: _*) @@ -161,6 +172,7 @@ class OpVectorMetadata private } object OpVectorMetadata { + import com.salesforce.op.utils.spark.RichMetadata._ val ColumnsKey = "vector_columns"