diff --git a/.travis.yml b/.travis.yml
index afac7b8f..5de6600f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,10 +5,10 @@ script: mvn ${SPARK} -Dmaven.javadoc.skip=true -B clean compile
 jdk: openjdk8
 matrix:
   include:
-  # Covers Spark 2.3.x
+  # Covers Spark 2.4.x + Scala 2.11
   - env: SPARK=
-  # Covers Spark 2.4.x + Scala 2.12
-  - env: SPARK="-Pspark-2.4 -Pscala-2.12"
+  # Covers Spark 3.0.x + Scala 2.12
+  - env: SPARK="-Pspark-3.0 -Pscala-2.12"
 cache:
   directories:
   - $HOME/.m2
diff --git a/ch05-kmeans/src/main/scala/com/cloudera/datascience/kmeans/RunKMeans.scala b/ch05-kmeans/src/main/scala/com/cloudera/datascience/kmeans/RunKMeans.scala
index f29afa7d..0f38ddf6 100644
--- a/ch05-kmeans/src/main/scala/com/cloudera/datascience/kmeans/RunKMeans.scala
+++ b/ch05-kmeans/src/main/scala/com/cloudera/datascience/kmeans/RunKMeans.scala
@@ -107,7 +107,7 @@ class RunKMeans(private val spark: SparkSession) {
     val pipeline = new Pipeline().setStages(Array(assembler, kmeans))
 
     val kmeansModel = pipeline.fit(data).stages.last.asInstanceOf[KMeansModel]
-    kmeansModel.computeCost(assembler.transform(data)) / data.count()
+    kmeansModel.summary.trainingCost
   }
 
   def clusteringScore1(data: DataFrame, k: Int): Double = {
@@ -126,7 +126,7 @@ class RunKMeans(private val spark: SparkSession) {
     val pipeline = new Pipeline().setStages(Array(assembler, kmeans))
 
     val kmeansModel = pipeline.fit(data).stages.last.asInstanceOf[KMeansModel]
-    kmeansModel.computeCost(assembler.transform(data)) / data.count()
+    kmeansModel.summary.trainingCost
   }
 
   def clusteringTake1(data: DataFrame): Unit = {
@@ -161,7 +161,7 @@ class RunKMeans(private val spark: SparkSession) {
     val pipelineModel = pipeline.fit(data)
 
     val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel]
-    kmeansModel.computeCost(pipelineModel.transform(data)) / data.count()
+    kmeansModel.summary.trainingCost
   }
 
   def clusteringTake2(data: DataFrame): Unit = {
@@ -215,7 +215,7 @@ class RunKMeans(private val spark: SparkSession) {
     val pipelineModel = pipeline.fit(data)
 
     val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel]
-    kmeansModel.computeCost(pipelineModel.transform(data)) / data.count()
+    kmeansModel.summary.trainingCost
   }
 
   def clusteringTake3(data: DataFrame): Unit = {
diff --git a/ch09-risk/src/main/scala/com/cloudera/datascience/risk/RunRisk.scala b/ch09-risk/src/main/scala/com/cloudera/datascience/risk/RunRisk.scala
index e10a50a3..2c19c6e5 100644
--- a/ch09-risk/src/main/scala/com/cloudera/datascience/risk/RunRisk.scala
+++ b/ch09-risk/src/main/scala/com/cloudera/datascience/risk/RunRisk.scala
@@ -225,7 +225,7 @@ class RunRisk(private val spark: SparkSession) {
     val bandwidth = 1.06 * stddev * math.pow(samples.size, -.2)
 
     // Using toList before toArray avoids a Scala bug
-    val domain = Range.Double(min, max, (max - min) / 100).toList.toArray
+    val domain = Range.BigDecimal(min, max, (max - min) / 100).map(_.toDouble).toList.toArray
     val kd = new KernelDensity().
       setSample(samples.toSeq.toDS.rdd).
       setBandwidth(bandwidth)
@@ -248,7 +248,7 @@ class RunRisk(private val spark: SparkSession) {
     val bandwidth = 1.06 * stddev * math.pow(count, -.2)
 
     // Using toList before toArray avoids a Scala bug
-    val domain = Range.Double(min, max, (max - min) / 100).toList.toArray
+    val domain = Range.BigDecimal(min, max, (max - min) / 100).map(_.toDouble).toList.toArray
     val kd = new KernelDensity().
       setSample(samples.rdd).
       setBandwidth(bandwidth)
diff --git a/pom.xml b/pom.xml
index 35daeefd..10880638 100644
--- a/pom.xml
+++ b/pom.xml
@@ -198,7 +198,7 @@
     <java.version>1.8</java.version>
    <scala.minor.version>2.11</scala.minor.version>
     <scala.complete.version>${scala.minor.version}.12</scala.complete.version>
-    <spark.version>2.3.3</spark.version>
+    <spark.version>2.4.4</spark.version>
   </properties>
 
   <repositories>
@@ -416,16 +416,16 @@
     </profile>
     <profile>
-      <id>spark-2.4</id>
+      <id>spark-3.0</id>
       <properties>
-        <spark.version>2.4.3</spark.version>
+        <spark.version>3.0.0-preview</spark.version>
       </properties>
     </profile>
 
     <profile>
       <id>scala-2.12</id>
       <properties>
         <scala.minor.version>2.12</scala.minor.version>
-        <scala.complete.version>${scala.minor.version}.8</scala.complete.version>
+        <scala.complete.version>${scala.minor.version}.10</scala.complete.version>
       </properties>
     </profile>
 