Skip to content

Commit

Permalink
Stable key order for map vectorizers (#88)
Browse files: browse the repository at this point in the history
  • Loading branch information
tovbinm committed Aug 24, 2018
1 parent 7135926 commit fb38b67
Show file tree
Hide file tree
Showing 7 changed files with 214 additions and 258 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@ abstract class OPMapVectorizer[A, T <: OPMap[A]]
val meta = if ($(trackNulls)) makeVectorMetaWithNullIndicators(allKeys) else makeVectorMetadata(allKeys)
setMetadata(meta.toMetadata)


val args = OPMapVectorizerModelArgs(
allKeys = allKeys,
fillByKey = fillByKey(dataset),
Expand Down Expand Up @@ -124,14 +123,14 @@ class IntegralMapVectorizer[T <: OPMap[Long]](uid: String = UID[IntegralMapVecto
/**
 * Chooses whether missing values are filled with the mode.
 *
 * Stores the negation of `shouldFill` in the `withConstant` param.
 *
 * @param shouldFill true to fill with the mode, false to fill with the constant
 * @return this stage, as returned by `set`
 */
def setFillWithMode(shouldFill: Boolean): this.type = {
  val useConstant = !shouldFill
  set(withConstant, useConstant)
}

/**
 * Computes the per-key fill values for every input map feature.
 *
 * @param dataset dataset whose rows hold one map value per input feature
 * @return an empty sequence when `withConstant` is set; otherwise the result of the
 *         `ModeSeqMapLong` aggregation over the cleaned maps, with each element
 *         passed through `convertFn`
 */
override def fillByKey(dataset: Dataset[Seq[T#Value]]): Seq[Map[String, Double]] = {
  if ($(withConstant)) Seq.empty
  else {
    // Everything needed for the aggregation is built only on this branch;
    // the constant-fill branch above never touches the dataset.
    val size = getInputFeatures().length
    val modeAggr = SequenceAggregators.ModeSeqMapLong(size = size)
    // Param value is read into a local before being used inside the dataset closure.
    val shouldCleanKeys = $(cleanKeys)
    val cleanedData = dataset.map(_.map(
      cleanMap(_, shouldCleanKey = shouldCleanKeys, shouldCleanValue = shouldCleanValues)
    ))
    cleanedData.select(modeAggr.toColumn).first()
  }.map(convertFn)
}
Expand Down Expand Up @@ -203,8 +202,6 @@ class TextMapHashingVectorizer[T <: OPMap[String]]
/**
 * Sets the hash space strategy.
 *
 * The strategy is stored in the `hashSpaceStrategy` param by its entry name.
 *
 * @param v strategy to use
 * @return this stage, as returned by `set`
 */
def setHashSpaceStrategy(v: HashSpaceStrategy): this.type = {
  val strategyName = v.entryName
  set(hashSpaceStrategy, strategyName)
}
/**
 * Reads the `hashSpaceStrategy` param and resolves it, case-insensitively,
 * back to a [[HashSpaceStrategy]] value.
 *
 * @return the strategy whose name matches the stored param value
 */
def getHashSpaceStrategy: HashSpaceStrategy = {
  val storedName = $(hashSpaceStrategy)
  HashSpaceStrategy.withNameInsensitive(storedName)
}

/**
 * Fill values by key for this vectorizer.
 *
 * @param dataset input dataset; not inspected
 * @return always an empty sequence
 */
def getFillByKey(dataset: Dataset[Seq[T#Value]]): Seq[Map[String, Double]] = {
  Seq.empty
}

def makeModel(args: OPMapVectorizerModelArgs, operationName: String, uid: String): OPMapVectorizerModel[String, T] =
new TextMapHashingVectorizerModel[T](
args = args.copy(shouldCleanValues = $(cleanText)),
Expand All @@ -230,14 +227,14 @@ class RealMapVectorizer[T <: OPMap[Double]](uid: String = UID[RealMapVectorizer[
/**
 * Chooses whether missing values are filled with the mean.
 *
 * Stores the negation of `shouldFill` in the `withConstant` param.
 *
 * @param shouldFill true to fill with the mean, false to fill with the constant
 * @return this stage, as returned by `set`
 */
def setFillWithMean(shouldFill: Boolean): this.type = {
  val useConstant = !shouldFill
  set(withConstant, useConstant)
}

/**
 * Computes the per-key fill values for every input map feature.
 *
 * @param dataset dataset whose rows hold one map value per input feature
 * @return an empty sequence when `withConstant` is set; otherwise the result of the
 *         `MeanSeqMapDouble` aggregation over the cleaned maps
 */
override def fillByKey(dataset: Dataset[Seq[T#Value]]): Seq[Map[String, Double]] = {
  if ($(withConstant)) Seq.empty
  else {
    // Everything needed for the aggregation is built only on this branch;
    // the constant-fill branch above never touches the dataset.
    val size = getInputFeatures().length
    val meanAggr = SequenceAggregators.MeanSeqMapDouble(size = size)
    // Param value is read into a local before being used inside the dataset closure.
    val shouldCleanKeys = $(cleanKeys)
    val cleanedData = dataset.map(_.map(
      cleanMap(_, shouldCleanKey = shouldCleanKeys, shouldCleanValue = shouldCleanValues)
    ))
    cleanedData.select(meanAggr.toColumn).first()
  }
}
Expand Down Expand Up @@ -282,7 +279,7 @@ trait MapVectorizerFuns[A, T <: OPMap[A]] extends VectorizerDefaults with MapPiv
in.map(_.map(kb => filterKeys(kb, shouldCleanKey = shouldCleanKeys, shouldCleanValue = shouldCleanValues).keySet))
.select(sumAggr.toColumn)
.first()
.map(_.toSeq)
.map(_.toSeq.sorted)
}

protected def makeVectorMetadata(allKeys: Seq[Seq[String]]): OpVectorMetadata = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ class SmartTextMapVectorizer[T <: OPMap[String]]

def makeSmartTextMapVectorizerModelArgs(aggregatedStats: Array[TextMapStats]): SmartTextMapVectorizerModelArgs = {
val maxCard = $(maxCardinality)
val minSup = ${minSupport}
val minSup = $(minSupport)
val shouldCleanKeys = $(cleanKeys)
val shouldCleanValues = $(cleanText)
val shouldTrackNulls = $(trackNulls)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ class BinaryMapVectorizerTest

val estimator = new BinaryMapVectorizer().setTrackNulls(false).setCleanKeys(true).setInput(m1, m2)

val expectedResult: Seq[OPVector] = Seq(
val expectedResult = Seq(
Vectors.sparse(6, Array(1), Array(1.0)),
Vectors.sparse(6, Array(4, 5), Array(1.0, 1.0)),
Vectors.sparse(6, Array(3, 4), Array(1.0, 1.0)),
Vectors.sparse(6, Array(), Array())
).map(_.toOPVector)

Expand All @@ -68,7 +68,7 @@ class BinaryMapVectorizerTest
val expectedMeta = TestOpVectorMetadataBuilder(
estimator,
m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(None, "B"), IndColWithGroup(None, "C")),
m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "X"))
m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(None, "Y"), IndColWithGroup(None, "Z"))
)

transformed.collect(vector) shouldBe expectedResult
Expand All @@ -82,8 +82,8 @@ class BinaryMapVectorizerTest
val transformed = estimator.setTrackNulls(true).fit(inputData).transform(inputData)
val vector = estimator.getOutput()
val expected = Array(
Vectors.sparse(12, Array(2, 5, 9, 11), Array(1.0, 1.0, 1.0, 1.0)),
Vectors.sparse(12, Array(1, 3, 7, 8, 10), Array(1.0, 1.0, 1.0, 1.0, 1.0)),
Vectors.sparse(12, Array(2, 5, 7, 9), Array(1.0, 1.0, 1.0, 1.0)),
Vectors.sparse(12, Array(1, 3, 6, 8, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0)),
Vectors.sparse(12, Array(1, 3, 5, 7, 9, 11), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))
).map(_.toOPVector)

Expand All @@ -93,9 +93,9 @@ class BinaryMapVectorizerTest
m1 -> List(IndColWithGroup(None, "A"), IndColWithGroup(nullIndicatorValue, "A"),
IndColWithGroup(None, "B"), IndColWithGroup(nullIndicatorValue, "B"),
IndColWithGroup(None, "C"), IndColWithGroup(nullIndicatorValue, "C")),
m2 -> List(IndColWithGroup(None, "Z"), IndColWithGroup(nullIndicatorValue, "Z"),
m2 -> List(IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X"),
IndColWithGroup(None, "Y"), IndColWithGroup(nullIndicatorValue, "Y"),
IndColWithGroup(None, "X"), IndColWithGroup(nullIndicatorValue, "X"))
IndColWithGroup(None, "Z"), IndColWithGroup(nullIndicatorValue, "Z"))
)

transformed.collect(vector) shouldBe expected
Expand Down
Loading

0 comments on commit fb38b67

Please sign in to comment.