-
Notifications
You must be signed in to change notification settings - Fork 394
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix local scoring with multipicklist features #243
Changes from 1 commit
47eb727
0978323
27f5a61
c9a1232
cb8d81b
f4a6718
085bba6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,12 +32,15 @@ package com.salesforce.op.local | |
|
||
import java.nio.file.Paths | ||
|
||
import com.salesforce.op.features.Feature | ||
import com.salesforce.op.features.types._ | ||
import com.salesforce.op.readers.DataFrameFieldNames._ | ||
import com.salesforce.op.stages.impl.classification.BinaryClassificationModelSelector | ||
import com.salesforce.op.stages.impl.classification.BinaryClassificationModelsToTry._ | ||
import com.salesforce.op.stages.base.unary.UnaryLambdaTransformer | ||
import com.salesforce.op.stages.impl.classification.{BinaryClassificationModelSelector, OpLogisticRegression} | ||
import com.salesforce.op.stages.impl.classification.BinaryClassificationModelsToTry | ||
import com.salesforce.op.stages.impl.feature.StringIndexerHandleInvalid | ||
import com.salesforce.op.test.{PassengerSparkFixtureTest, TestCommon} | ||
import com.salesforce.op.test.{PassengerSparkFixtureTest, TestCommon, TestFeatureBuilder} | ||
import com.salesforce.op.testkit._ | ||
import com.salesforce.op.utils.spark.RichDataset._ | ||
import com.salesforce.op.utils.spark.RichRow._ | ||
import com.salesforce.op.{OpParams, OpWorkflow} | ||
|
@@ -60,7 +63,7 @@ class OpWorkflowRunnerLocalTest extends FlatSpec with PassengerSparkFixtureTest | |
.indexed(handleInvalid = StringIndexerHandleInvalid.Skip) | ||
|
||
val prediction = BinaryClassificationModelSelector.withTrainValidationSplit( | ||
splitter = None, modelTypesToUse = Seq(OpLogisticRegression) | ||
splitter = None, modelTypesToUse = Seq(BinaryClassificationModelsToTry.OpLogisticRegression) | ||
).setInput(survivedNum, features).getOutput() | ||
|
||
val workflow = new OpWorkflow().setResultFeatures(prediction, survivedNum, indexed).setReader(dataReader) | ||
|
@@ -117,4 +120,47 @@ class OpWorkflowRunnerLocalTest extends FlatSpec with PassengerSparkFixtureTest | |
} score shouldBe expected | ||
} | ||
|
||
it should "handle multi picklist features without throwing an exception" in { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The test seems like overkill for such a simple check ;) Let's think about whether we can make it simpler and less labor-intensive. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's simply test it in |
||
// First set up the raw features: | ||
val currencyData: Seq[Currency] = RandomReal.logNormal[Currency](mean = 10.0, sigma = 1.0).limit(1000) | ||
val domain = List("Strawberry Milk", "Chocolate Milk", "Soy Milk", "Almond Milk") | ||
val picklistData: Seq[PickList] = RandomText.pickLists(domain).limit(1000) | ||
val domainSize = 20 | ||
val maxChoices = 2 | ||
val multiPickListData: Seq[MultiPickList] = RandomMultiPickList.of( | ||
RandomText.textFromDomain(domain = List.range(0, domainSize).map(_.toString)), minLen = 0, maxLen = maxChoices | ||
).limit(1000) | ||
|
||
// Generate the raw features and corresponding dataframe | ||
val generatedData: Seq[(Currency, MultiPickList, PickList)] = | ||
currencyData.zip(multiPickListData).zip(picklistData).map { | ||
case ((cu, mp), pm) => (cu, mp, pm) | ||
} | ||
val (rawDF, rawCurrency, rawMultiPickList, rawPicklist) = | ||
TestFeatureBuilder("currency", "multipicklist", "picklist", generatedData) | ||
|
||
val labelTransformer = new UnaryLambdaTransformer[Currency, RealNN](operationName = "labelFunc", | ||
transformFn = p => if (p.value.exists(_ >= 10.0)) 1.0.toRealNN else 0.0.toRealNN | ||
) | ||
|
||
val labelData = labelTransformer.setInput(rawCurrency).getOutput().asInstanceOf[Feature[RealNN]] | ||
.copy(isResponse = true) | ||
|
||
val genFeatureVector = Seq(rawCurrency, rawMultiPickList, rawPicklist).transmogrify() | ||
|
||
val prediction = new OpLogisticRegression().setInput(labelData, genFeatureVector).getOutput() | ||
|
||
val workflow = new OpWorkflow().setResultFeatures(prediction) | ||
|
||
val model = workflow.setInputDataset(rawDF).train() | ||
|
||
noException should be thrownBy | ||
model.scoreFunction(Map("currency" -> 10.0, "multipicklist" -> Seq("0", "19"), "picklist" -> "Soy Milk")) | ||
|
||
noException should be thrownBy | ||
model.scoreFunction(Map("currency" -> 10.0, "multipicklist" -> Nil, "picklist" -> "Soy Milk")) | ||
|
||
noException should be thrownBy | ||
model.scoreFunction(Map("currency" -> 10.0, "multipicklist" -> null, "picklist" -> "Soy Milk")) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You would also have to make the same fix for
`MultiPickListMap`.