diff --git a/.travis.yml b/.travis.yml
index afac7b8f..5de6600f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,10 +5,10 @@ script: mvn ${SPARK} -Dmaven.javadoc.skip=true -B clean compile
jdk: openjdk8
matrix:
include:
- # Covers Spark 2.3.x
+ # Covers Spark 2.4.x + Scala 2.11
- env: SPARK=
- # Covers Spark 2.4.x + Scala 2.12
- - env: SPARK="-Pspark-2.4 -Pscala-2.12"
+ # Covers Spark 3.0.x + Scala 2.12
+ - env: SPARK="-Pspark-3.0 -Pscala-2.12"
cache:
directories:
- $HOME/.m2
diff --git a/ch05-kmeans/src/main/scala/com/cloudera/datascience/kmeans/RunKMeans.scala b/ch05-kmeans/src/main/scala/com/cloudera/datascience/kmeans/RunKMeans.scala
index f29afa7d..0f38ddf6 100644
--- a/ch05-kmeans/src/main/scala/com/cloudera/datascience/kmeans/RunKMeans.scala
+++ b/ch05-kmeans/src/main/scala/com/cloudera/datascience/kmeans/RunKMeans.scala
@@ -107,7 +107,7 @@ class RunKMeans(private val spark: SparkSession) {
val pipeline = new Pipeline().setStages(Array(assembler, kmeans))
val kmeansModel = pipeline.fit(data).stages.last.asInstanceOf[KMeansModel]
- kmeansModel.computeCost(assembler.transform(data)) / data.count()
+ kmeansModel.summary.trainingCost / data.count() // trainingCost is a sum; keep returning the mean
}
def clusteringScore1(data: DataFrame, k: Int): Double = {
@@ -126,7 +126,7 @@ class RunKMeans(private val spark: SparkSession) {
val pipeline = new Pipeline().setStages(Array(assembler, kmeans))
val kmeansModel = pipeline.fit(data).stages.last.asInstanceOf[KMeansModel]
- kmeansModel.computeCost(assembler.transform(data)) / data.count()
+ kmeansModel.summary.trainingCost / data.count() // trainingCost is a sum; keep returning the mean
}
def clusteringTake1(data: DataFrame): Unit = {
@@ -161,7 +161,7 @@ class RunKMeans(private val spark: SparkSession) {
val pipelineModel = pipeline.fit(data)
val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel]
- kmeansModel.computeCost(pipelineModel.transform(data)) / data.count()
+ kmeansModel.summary.trainingCost / data.count() // trainingCost is a sum; keep returning the mean
}
def clusteringTake2(data: DataFrame): Unit = {
@@ -215,7 +215,7 @@ class RunKMeans(private val spark: SparkSession) {
val pipelineModel = pipeline.fit(data)
val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel]
- kmeansModel.computeCost(pipelineModel.transform(data)) / data.count()
+ kmeansModel.summary.trainingCost / data.count() // trainingCost is a sum; keep returning the mean
}
def clusteringTake3(data: DataFrame): Unit = {
diff --git a/ch09-risk/src/main/scala/com/cloudera/datascience/risk/RunRisk.scala b/ch09-risk/src/main/scala/com/cloudera/datascience/risk/RunRisk.scala
index e10a50a3..2c19c6e5 100644
--- a/ch09-risk/src/main/scala/com/cloudera/datascience/risk/RunRisk.scala
+++ b/ch09-risk/src/main/scala/com/cloudera/datascience/risk/RunRisk.scala
@@ -225,7 +225,7 @@ class RunRisk(private val spark: SparkSession) {
val bandwidth = 1.06 * stddev * math.pow(samples.size, -.2)
// Using toList before toArray avoids a Scala bug
- val domain = Range.Double(min, max, (max - min) / 100).toList.toArray
+ val domain = Range.BigDecimal(min, max, (max - min) / 100).map(_.toDouble).toList.toArray
val kd = new KernelDensity().
setSample(samples.toSeq.toDS.rdd).
setBandwidth(bandwidth)
@@ -248,7 +248,7 @@ class RunRisk(private val spark: SparkSession) {
val bandwidth = 1.06 * stddev * math.pow(count, -.2)
// Using toList before toArray avoids a Scala bug
- val domain = Range.Double(min, max, (max - min) / 100).toList.toArray
+ val domain = Range.BigDecimal(min, max, (max - min) / 100).map(_.toDouble).toList.toArray
val kd = new KernelDensity().
setSample(samples.rdd).
setBandwidth(bandwidth)
diff --git a/pom.xml b/pom.xml
index 35daeefd..10880638 100644
--- a/pom.xml
+++ b/pom.xml
@@ -198,7 +198,7 @@
1.8
2.11
${scala.minor.version}.12
- 2.3.3
+ 2.4.4
@@ -416,16 +416,16 @@
- spark-2.4
+ spark-3.0
- 2.4.3
+ 3.0.0-preview
scala-2.12
2.12
- ${scala.minor.version}.8
+ ${scala.minor.version}.10