Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix local scoring with multipicklist features #243

Merged
merged 7 commits into from
Mar 20, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ case object FeatureTypeSparkConverter {

// Sets
case wt if wt <:< weakTypeOf[t.MultiPickList] => (value: Any) =>
if (value == null) FeatureTypeDefaults.MultiPickList.value else value.asInstanceOf[MWrappedArray[String]].toSet
if (value == null) FeatureTypeDefaults.MultiPickList.value else value.asInstanceOf[Seq[String]].toSet
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you would also have to make the same fix for MultiPickListMap


// Everything else
case _ => identity
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,15 @@ package com.salesforce.op.local

import java.nio.file.Paths

import com.salesforce.op.features.Feature
import com.salesforce.op.features.types._
import com.salesforce.op.readers.DataFrameFieldNames._
import com.salesforce.op.stages.impl.classification.BinaryClassificationModelSelector
import com.salesforce.op.stages.impl.classification.BinaryClassificationModelsToTry._
import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer
import com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, OpLogisticRegression}
import com.salesforce.op.stages.impl.classification.BinaryClassificationModelsToTry
import com.salesforce.op.stages.impl.feature.StringIndexerHandleInvalid
import com.salesforce.op.test.{PassengerSparkFixtureTest, TestCommon}
import com.salesforce.op.test.{PassengerSparkFixtureTest, TestCommon, TestFeatureBuilder}
import com.salesforce.op.testkit._
import com.salesforce.op.utils.spark.RichDataset._
import com.salesforce.op.utils.spark.RichRow._
import com.salesforce.op.{OpParams, OpWorkflow}
Expand All @@ -60,7 +63,7 @@ class OpWorkflowRunnerLocalTest extends FlatSpec with PassengerSparkFixtureTest
.indexed(handleInvalid = StringIndexerHandleInvalid.Skip)

val prediction = BinaryClassificationModelSelector.withTrainValidationSplit(
splitter = None, modelTypesToUse = Seq(OpLogisticRegression)
splitter = None, modelTypesToUse = Seq(BinaryClassificationModelsToTry.OpLogisticRegression)
).setInput(survivedNum, features).getOutput()

val workflow = new OpWorkflow().setResultFeatures(prediction, survivedNum, indexed).setReader(dataReader)
Expand Down Expand Up @@ -117,4 +120,47 @@ class OpWorkflowRunnerLocalTest extends FlatSpec with PassengerSparkFixtureTest
} score shouldBe expected
}

it should "handle multi picklist features without throwing an exception" in {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test seems like overkill for such a simple check ;) Let's think about whether we can make it simpler and less labor-intensive.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's simply test it in FeatureTypeSparkConverterTest

// First set up the raw features:
// 1000 generated values each for the currency, picklist and multi-picklist columns.
val currencyData: Seq[Currency] = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).limit(1000)
val domain = List("Strawberry Milk", "Chocolate Milk", "Soy Milk", "Almond Milk")
val picklistData: Seq[PickList] = RandomText.pickLists(domain).limit(1000)
// The multi-picklist domain is the strings "0" through "19"; minLen/maxLen bound the number of
// choices per value (exact inclusivity is defined by RandomMultiPickList — TODO confirm).
val domainSize = 20
val maxChoices = 2
val multiPickListData: Seq[MultiPickList] = RandomMultiPickList.of(
RandomText.textFromDomain(domain = List.range(0, domainSize).map(_.toString)), minLen = 0, maxLen = maxChoices
).limit(1000)

// Generate the raw features and corresponding dataframe
// The three generated sequences are zipped positionally into (currency, multipicklist, picklist) rows.
val generatedData: Seq[(Currency, MultiPickList, PickList)] =
currencyData.zip(multiPickListData).zip(picklistData).map {
case ((cu, mp), pm) => (cu, mp, pm)
}
val (rawDF, rawCurrency, rawMultiPickList, rawPicklist) =
TestFeatureBuilder("currency", "multipicklist", "picklist", generatedData)

// Derive a binary label from the currency feature: 1.0 when a value is present and >= 10.0, otherwise 0.0.
val labelTransformer = new UnaryLambdaTransformer[Currency, RealNN](operationName = "labelFunc",
transformFn = p => if (p.value.exists(_ >= 10.0)) 1.0.toRealNN else 0.0.toRealNN
)

// The transformer output is cast to Feature[RealNN] and copied with isResponse = true so that it
// can be used as the label input of the model below.
val labelData = labelTransformer.setInput(rawCurrency).getOutput().asInstanceOf[Feature[RealNN]]
.copy(isResponse = true)

// Combine all three raw features into a single feature vector.
val genFeatureVector = Seq(rawCurrency, rawMultiPickList, rawPicklist).transmogrify()

val prediction = new OpLogisticRegression().setInput(labelData, genFeatureVector).getOutput()

val workflow = new OpWorkflow().setResultFeatures(prediction)

// Train directly on the generated dataframe rather than through a reader.
val model = workflow.setInputDataset(rawDF).train()

// Local scoring must not throw for a populated multipicklist value ...
noException should be thrownBy
model.scoreFunction(Map("currency" -> 10.0, "multipicklist" -> Seq("0", "19"), "picklist" -> "Soy Milk"))

// ... nor for an empty one ...
noException should be thrownBy
model.scoreFunction(Map("currency" -> 10.0, "multipicklist" -> Nil, "picklist" -> "Soy Milk"))

// ... nor for a null one.
noException should be thrownBy
model.scoreFunction(Map("currency" -> 10.0, "multipicklist" -> null, "picklist" -> "Soy Milk"))
}
}