Skip to content

Commit

Permalink
Removed an old test I added to check if Spark ran out of memory when …
Browse files Browse the repository at this point in the history
…calculating a correlation matrix (this is unnecessary and unhelpful) (#160)
  • Loading branch information
Jauntbox authored and tovbinm committed Oct 22, 2018
1 parent 5e762c3 commit 0fcfccf
Showing 1 changed file with 1 addition and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -532,37 +532,7 @@ class SanityCheckerTest extends OpEstimatorSpec[OPVector, BinaryModel[RealNN, OP
featuresToDrop, featuresWithNaNCorr)
}

// NOTE(review): this test depends on Spark serializer buffer settings and may stop
// failing if those settings change — TODO confirm it is still worth keeping.
it should "fail (due to a Kryo buffer overflow) when calculating a large (5k x 5k) correlation matrix " in {
  // Hash space sized so the full 5k x 5k correlation matrix overflows the Kryo buffer.
  val hashSpaceSize = 5000

  // Build a 5k-element hashed feature vector from the text map.
  val hashedFeatures = textMap.vectorize(
    shouldPrependFeatureName = TransmogrifierDefaults.PrependFeatureName,
    cleanText = false,
    cleanKeys = TransmogrifierDefaults.CleanKeys,
    others = Array.empty,
    trackNulls = TransmogrifierDefaults.TrackNulls,
    numHashes = hashSpaceSize
  )

  // setFeatureLabelCorrOnly(false) forces the full feature-feature correlation
  // matrix to be computed, which is what blows up at this size.
  val checker = new SanityChecker()
    .setCheckSample(1.0)
    .setRemoveBadFeatures(true)
    .setRemoveFeatureGroup(true)
    .setProtectTextSharedHash(true)
    .setFeatureLabelCorrOnly(false)
    .setMinCorrelation(0.0)
    .setMaxCorrelation(0.8)
    .setMaxCramersV(0.8)
    .setInput(targetResponse, hashedFeatures)
  val checkedFeatures = checker.getOutput()

  checkedFeatures.originStage shouldBe a[SanityChecker]

  // Running the workflow materializes the correlation computation and should throw.
  intercept[SparkException] {
    new OpWorkflow().setResultFeatures(hashedFeatures, checkedFeatures).transform(textData)
  }
}

it should "not fail when calculating feature-label correlations on that same 5k element feature vector" in {
it should "not fail when calculating feature-label correlations on a 5k element feature vector" in {
val numHashes = 5000

val vectorized = textMap.vectorize(
Expand Down

0 comments on commit 0fcfccf

Please sign in to comment.