salesforce · MWYang · Oct 7, 2019 · Oct 7, 2019 · Oct 7, 2019 · Oct 7, 2019
@@ -235,6 +235,8 @@ trait RichTextFeature {
       hashSpaceStrategy: HashSpaceStrategy = TransmogrifierDefaults.HashSpaceStrategy,
       defaultLanguage: Language = TextTokenizer.DefaultLanguage,
       hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm,
+      detectSensitive: Boolean = false,
+      removeSensitive: Boolean = false,
       others: Array[FeatureLike[T]] = Array.empty
     ): FeatureLike[OPVector] = {
       // scalastyle:on parameter.number
@@ -258,6 +260,8 @@ trait RichTextFeature {
         .setHashSpaceStrategy(hashSpaceStrategy)
         .setHashAlgorithm(hashAlgorithm)
         .setBinaryFreq(binaryFreq)
+        .setDetectSensitive(detectSensitive)
+        .setRemoveSensitive(removeSensitive)
         .getOutput()
     }
 
@@ -434,6 +438,24 @@ trait RichTextFeature {
       toLowercase: Boolean = TextTokenizer.ToLowercase
     ): FeatureLike[Binary] =
       f.transformWith(new SubstringTransformer[T, T2]().setToLowercase(toLowercase), f2)
+
+    /**
+     * Detect whether this feature holds actual human names and, if it does, expose the related
+     * demographic information.
+     *
+     * @param threshold optional, fraction of rows that must contain names before the feature is
+     *                  processed as a name (default = 0.50)
+     * @return NameStats, a custom map that will be empty if no name was found
+     */
+    def identifyIfHumanName(threshold: Double = 0.50): FeatureLike[NameStats] = {
+      val identifier = new HumanNameIdentifier[T]().setThreshold(threshold)
+      identifier.setInput(f).getOutput()
+    }
+
+    /**
+     * Detect whether this feature holds postal codes and, if it does, return each postal code
+     * together with its lat/long.
+     *
+     * @param threshold optional, fraction of rows that must contain valid postal codes before the
+     *                  feature is processed as a postal code (default = 0.90)
+     * @return PostalCodeMap, will be empty if the feature was not identified as postal codes
+     */
+    def identifyIfPostalCode(threshold: Double = 0.90): FeatureLike[PostalCodeMap] = {
+      val identifier = new PostalCodeIdentifier[T]().setThreshold(threshold)
+      identifier.setInput(f).getOutput()
+    }
   }
 
   implicit class RichPhoneFeature(val f: FeatureLike[Phone]) {

@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.stages.impl.feature
+
+import com.salesforce.op._
+import com.salesforce.op.features.types._
+import com.salesforce.op.stages.base.unary.{UnaryEstimator, UnaryModel}
+import com.salesforce.op.utils.stages.NameIdentificationFun
+import com.salesforce.op.utils.stages.NameIdentificationUtils.GenderDictionary
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.ml.param.{DoubleParam, IntParam, ParamValidators}
+import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.MetadataBuilder
+
+import scala.reflect.runtime.universe.TypeTag
+
+/**
+ * Unary estimator for identifying whether a single Text column is a name or not. If the column does appear to be a
+ * name, a custom map will be returned that contains the guessed gender for each entry. If the column does not appear
+ * to be a name, then the output will be an empty map.
+ *
+ * Fitting also records the keys "treatAsName", "predictedNameProb" and "bestFirstNameIndex" in the stage metadata.
+ *
+ * @param uid           uid for instance
+ * @param operationName unique name of the operation this stage performs
+ * @param tti           type tag for input
+ * @param ttiv          type tag for input value
+ * @tparam T            the FeatureType (subtype of Text) to operate over
+ */
+class HumanNameIdentifier[T <: Text]
+(
+  uid: String = UID[HumanNameIdentifier[T]],
+  operationName: String = "human name identifier"
+)
+(
+  implicit tti: TypeTag[T],
+  override val ttiv: TypeTag[T#Value]
+) extends UnaryEstimator[T, NameStats](
+  uid = uid,
+  operationName = operationName
+) with NameIdentificationFun[T] {
+
+  /** Fraction of entries that must look like names; valid values lie strictly between 0.0 and 1.0. */
+  val defaultThreshold = new DoubleParam(
+    parent = this,
+    name = "defaultThreshold",
+    doc = "default fraction of entries to be names before treating as name",
+    isValid = (value: Double) => {
+      ParamValidators.gt(0.0)(value) && ParamValidators.lt(1.0)(value)
+    }
+  )
+  setDefault(defaultThreshold, 0.50)
+  def setThreshold(value: Double): this.type = set(defaultThreshold, value)
+
+  /** Timeout in milliseconds handed to the fit helper; must be positive, defaults to three minutes. */
+  val countApproxTimeout = new IntParam(
+    parent = this,
+    name = "countApproxTimeout",
+    doc = "how long to wait (in milliseconds) for result of dataset.rdd.countApprox",
+    isValid = (value: Int) => { ParamValidators.gt(0)(value) }
+  )
+  setDefault(countApproxTimeout, 3 * 60 * 1000)
+  def setCountApproxTimeout(value: Int): this.type = set(countApproxTimeout, value)
+
+  /**
+   * Fit the estimator on a single-column dataset and record the outcome in the stage metadata.
+   *
+   * @param dataset dataset with exactly one column holding the raw text values
+   * @return model carrying the name decision and the index of the best first-name position, if any
+   * @throws IllegalArgumentException if the dataset does not have exactly one column
+   */
+  def fitFn(dataset: Dataset[T#Value]): HumanNameIdentifierModel[T] = {
+    // The message is only shown on failure, so it states the expectation and what was actually found
+    require(
+      dataset.schema.fieldNames.length == 1,
+      s"Expected exactly one column in this dataset, but found ${dataset.schema.fieldNames.length}"
+    )
+
+    val column = col(dataset.schema.fieldNames.head)
+    val (predictedNameProb, treatAsName, bestFirstNameIndex) = unaryEstimatorFitFn(
+      dataset, column, $(defaultThreshold), $(countApproxTimeout)
+    )
+
+    // modified from: https://docs.transmogrif.ai/en/stable/developer-guide/index.html#metadata
+    val preExistingMetadata = getMetadata()
+    val metaDataBuilder = new MetadataBuilder().withMetadata(preExistingMetadata)
+    metaDataBuilder.putBoolean("treatAsName", treatAsName)
+    // Stored as a double: `.toLong` would truncate a fractional probability down to 0.
+    // NOTE(review): assumes `unaryEstimatorFitFn` yields a fractional value here - confirm in NameIdentificationFun
+    metaDataBuilder.putDouble("predictedNameProb", predictedNameProb.toDouble)
+    // -1 marks "no first name index found"
+    metaDataBuilder.putLong("bestFirstNameIndex", bestFirstNameIndex.getOrElse(-1).toLong)
+    val updatedMetadata = metaDataBuilder.build()
+    setMetadata(updatedMetadata)
+
+    new HumanNameIdentifierModel[T](uid, treatAsName, indexFirstName = bestFirstNameIndex)
+  }
+}
+
+
+/**
+ * Model produced by fitting a [[HumanNameIdentifier]].
+ *
+ * @param uid            uid for instance
+ * @param treatAsName    whether the fitted column should be treated as holding names
+ * @param indexFirstName index handed to the name helper for locating the first name, if one was found
+ * @param tti            type tag for input
+ * @tparam T             the FeatureType (subtype of Text) to operate over
+ */
+class HumanNameIdentifierModel[T <: Text]
+(
+  override val uid: String,
+  val treatAsName: Boolean,
+  val indexFirstName: Option[Int] = None
+)(implicit tti: TypeTag[T])
+  extends UnaryModel[T, NameStats]("human name identifier", uid) with NameIdentificationFun[T] {
+
+  // Populated by `transform`; stays None until the model has been applied to a dataset
+  var broadcastGenderDict: Option[Broadcast[GenderDictionary]] = None
+
+  /**
+   * Broadcast a fresh gender dictionary through the dataset's Spark context, then run the regular transform.
+   *
+   * @param dataset dataset to transform
+   * @return transformed data frame
+   */
+  override def transform(dataset: Dataset[_]): DataFrame = {
+    val spark: SparkSession = dataset.sparkSession
+    broadcastGenderDict = Some(spark.sparkContext.broadcast(GenderDictionary()))
+    super.transform(dataset)
+  }
+
+  /**
+   * Row-level function mapping one input value to its NameStats.
+   *
+   * @throws NoSuchElementException if invoked before `transform` has broadcast the gender dictionary
+   */
+  def transformFn: T => NameStats = (input: T) => {
+    // Same exception type as the bare `.get` it replaces, but with an actionable message
+    val genderDict = broadcastGenderDict.getOrElse(
+      throw new NoSuchElementException(
+        "Gender dictionary has not been broadcast yet: call transform(dataset) before using transformFn"
+      )
+    )
+    transformerFn(treatAsName, indexFirstName, input, genderDict)
+  }
+}
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op.stages.impl.feature
+
+import com.salesforce.op._
+import com.salesforce.op.features.types._
+import com.salesforce.op.stages.base.unary.{UnaryEstimator, UnaryModel}
+import com.salesforce.op.utils.text.TextUtils.getBestRegexMatch
+import org.apache.spark.ml.param.{DoubleParam, ParamValidators}
+import org.apache.spark.sql.{Dataset, SparkSession}
+import org.apache.spark.sql.expressions.UserDefinedFunction
+import org.apache.spark.sql.functions._
+import org.apache.spark.util.SparkUtils.averageBoolCol
+
+import scala.collection.mutable
+import scala.io.Source
+import scala.reflect.runtime.universe.TypeTag
+import scala.util.Try
+import scala.util.matching.Regex
+
+/**
+ * Helpers shared by the postal code estimator and its model: a lazily loaded postal code lookup table
+ * and regex based extraction of a postal code candidate from free text.
+ */
+trait PostalCodeHelpers {
+  /**
+   * Postal code -> (latitude, longitude), read from the comma separated classpath resource
+   * `/USPostalCodes.txt` (code, latitude, longitude). A coordinate that is missing or not a number
+   * is stored as None.
+   *
+   * @throws IllegalStateException if the resource cannot be found on the classpath
+   */
+  lazy val postalCodeDictionary: mutable.Map[String, (Option[Double], Option[Double])] = {
+    val dictionaryPath = "/USPostalCodes.txt"
+    // getResourceAsStream returns null for a missing resource; fail with a clear message instead of an NPE
+    val stream = Option(getClass.getResourceAsStream(dictionaryPath)).getOrElse(
+      throw new IllegalStateException(s"Postal code dictionary resource not found: $dictionaryPath")
+    )
+    val source = Source.fromInputStream(stream)
+    try {
+      val dictionary = mutable.Map.empty[String, (Option[Double], Option[Double])]
+      for {row <- source.getLines()} {
+        val cols = row.split(",").map(_.trim)
+        val lat = Try(cols(1).toDouble).toOption
+        val lng = Try(cols(2).toDouble).toOption
+        dictionary += (cols(0) -> (lat, lng))
+      }
+      dictionary
+    } finally {
+      // Release the resource even when reading or parsing fails
+      source.close()
+    }
+  }
+
+  /** Candidate patterns handed to `getBestRegexMatch`, listed from five digits down to three. */
+  val patterns: Seq[Regex] = Seq(
+    ".*(\\d{5}).*".r,
+    ".*(\\d{4}).*".r,
+    ".*(\\d{3}).*".r
+  )
+
+  /**
+   * Extract the best postal code candidate from a string, left-padding short matches with zeros
+   * up to five characters.
+   *
+   * @param s raw text to search
+   * @return the candidate padded to at least five characters, or the empty result unchanged
+   */
+  def findBestPostalCodeMatch(s: String): String = {
+    val result = getBestRegexMatch(patterns, s)
+    // An empty result is returned untouched: padding it would fabricate the code "00000" and defeat
+    // callers that test for an empty match.
+    // NOTE(review): presumably getBestRegexMatch returns "" when nothing matches - confirm in TextUtils
+    if (result.isEmpty || result.length >= 5) result
+    else "0" * (5 - result.length) + result
+  }
+}
+
+/**
+ * Unary estimator for identifying whether a single Text column holds postal codes. The column is treated
+ * as postal codes when the fraction of rows whose extracted code is present in the postal code dictionary
+ * reaches the configured threshold.
+ *
+ * @param uid           uid for instance
+ * @param operationName unique name of the operation this stage performs
+ * @param tti           type tag for input
+ * @param ttiv          type tag for input value
+ * @tparam T            the FeatureType (subtype of Text) to operate over
+ */
+class PostalCodeIdentifier[T <: Text]
+(
+  uid: String = UID[PostalCodeIdentifier[_]],
+  operationName: String = "postal code identifier"
+)
+(
+  implicit tti: TypeTag[T],
+  override val ttiv: TypeTag[T#Value]
+) extends UnaryEstimator[T, PostalCodeMap](
+  uid = uid,
+  operationName = operationName
+) with PostalCodeHelpers {
+  // Parameters
+  /** Fraction of rows that must validate as postal codes; valid values lie strictly between 0.0 and 1.0. */
+  val defaultThreshold = new DoubleParam(
+    parent = this,
+    name = "defaultThreshold",
+    doc = "default fraction of successful postal code validations before treating as Postal Code",
+    isValid = (value: Double) => {
+      ParamValidators.gt(0.0)(value) && ParamValidators.lt(1.0)(value)
+    }
+  )
+  setDefault(defaultThreshold, 0.90)
+
+  def setThreshold(value: Double): this.type = set(defaultThreshold, value)
+
+  /** UDF that is true when the best postal code candidate of a value is a known dictionary entry. */
+  private def checkIfPostalCode: UserDefinedFunction = udf((s: String) => {
+    // Missing values arrive as null; count them as "not a postal code" instead of passing null on
+    Option(s).exists { raw =>
+      val matched = findBestPostalCodeMatch(raw)
+      matched != "" && (postalCodeDictionary contains matched)
+    }
+  }: Boolean)
+
+  /**
+   * Fit the estimator on a single-column dataset.
+   *
+   * @param dataset dataset with exactly one column holding the raw text values
+   * @return model flagged to treat the column as postal codes when the threshold is reached
+   * @throws IllegalArgumentException if the dataset does not have exactly one column
+   */
+  def fitFn(dataset: Dataset[Text#Value]): PostalCodeIdentifierModel[T] = {
+    require(
+      dataset.schema.fieldNames.length == 1,
+      s"Expected exactly one column in this dataset, but found ${dataset.schema.fieldNames.length}"
+    )
+    // Take the encoders from the session that owns the dataset rather than creating or holding a
+    // session when the estimator is constructed
+    val spark: SparkSession = dataset.sparkSession
+    import spark.implicits._
+
+    val column = col(dataset.schema.fieldNames.head)
+    val validFraction = averageBoolCol(
+      dataset.select(checkIfPostalCode(column).alias(column.toString).as[Boolean]),
+      column
+    )
+    new PostalCodeIdentifierModel[T](uid, validFraction >= $(defaultThreshold))
+  }
+}
+
+/**
+ * Model produced by fitting a [[PostalCodeIdentifier]].
+ *
+ * @param uid               uid for instance
+ * @param treatAsPostalCode whether the fitted column should be treated as holding postal codes
+ * @param tti               type tag for input
+ * @tparam T                the FeatureType (subtype of Text) to operate over
+ */
+class PostalCodeIdentifierModel[T <: Text]
+(
+  override val uid: String,
+  val treatAsPostalCode: Boolean
+)(implicit tti: TypeTag[T])
+  extends UnaryModel[T, PostalCodeMap]("postal code identifier", uid)
+    with PostalCodeHelpers {
+
+  /**
+   * Row-level function mapping one text value to a PostalCodeMap.
+   *
+   * Returns an empty map when the column is not treated as postal codes. Otherwise the map holds
+   * "postalCode", "lat" and "lng" when both coordinates are known for the extracted code.
+   */
+  def transformFn: Text => PostalCodeMap = input => {
+    if (treatAsPostalCode) {
+      // Only extract a candidate when the result is actually used
+      val postalCode = findBestPostalCodeMatch(input.value.getOrElse(""))
+      postalCodeDictionary.get(postalCode) match {
+        case Some((Some(lat), Some(lng))) =>
+          PostalCodeMap(Map("postalCode" -> postalCode, "lat" -> lat.toString, "lng" -> lng.toString))
+        // NOTE(review): this fallback keys the entry by the code itself (code -> "true") instead of
+        // "postalCode" -> code as in the branch above - confirm the differing schema is intended
+        case _ => PostalCodeMap(Map(postalCode -> "true", "lat" -> "", "lng" -> ""))
+      }
+    }
+    else PostalCodeMap(Map.empty[String, String])
+  }
+}