forked from apache/spark
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SPARK-13013][DOCS] Replace example code in mllib-clustering.md using…
… include_example Replace example code in mllib-clustering.md using include_example https://issues.apache.org/jira/browse/SPARK-13013 The example code in the user guide is embedded in the markdown and hence it is not easy to test. It would be nice to automatically test them. This JIRA is to discuss options to automate example code testing and see what we can do in Spark 1.6. Goal is to move actual example code to spark/examples and test compilation in Jenkins builds. Then in the markdown, we can reference part of the code to show in the user guide. This requires adding a Jekyll tag that is similar to https://github.com/jekyll/jekyll/blob/master/lib/jekyll/tags/include.rb, e.g., called include_example. `{% include_example scala/org/apache/spark/examples/mllib/KMeansExample.scala %}` Jekyll will find `examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala` and pick code blocks marked "example" and replace code block in `{% highlight %}` in the markdown. See more sub-tasks in parent ticket: https://issues.apache.org/jira/browse/SPARK-11337 Author: Xin Ren <iamshrek@126.com> Closes apache#11116 from keypointt/SPARK-13013.
- Loading branch information
Showing
16 changed files
with
715 additions
and
447 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
(1.0), [1.7, 0.4, 0.9] | ||
(2.0), [2.2, 1.8, 0.0] |
Large diffs are not rendered by default.
Oops, something went wrong.
72 changes: 72 additions & 0 deletions
72
examples/src/main/java/org/apache/spark/examples/mllib/JavaGaussianMixtureExample.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.examples.mllib; | ||
|
||
import org.apache.spark.SparkConf; | ||
import org.apache.spark.api.java.JavaSparkContext; | ||
|
||
// $example on$ | ||
import org.apache.spark.api.java.JavaRDD; | ||
import org.apache.spark.api.java.function.Function; | ||
import org.apache.spark.mllib.clustering.GaussianMixture; | ||
import org.apache.spark.mllib.clustering.GaussianMixtureModel; | ||
import org.apache.spark.mllib.linalg.Vector; | ||
import org.apache.spark.mllib.linalg.Vectors; | ||
// $example off$ | ||
|
||
public class JavaGaussianMixtureExample { | ||
public static void main(String[] args) { | ||
|
||
SparkConf conf = new SparkConf().setAppName("JavaGaussianMixtureExample"); | ||
JavaSparkContext jsc = new JavaSparkContext(conf); | ||
|
||
// $example on$ | ||
// Load and parse data | ||
String path = "data/mllib/gmm_data.txt"; | ||
JavaRDD<String> data = jsc.textFile(path); | ||
JavaRDD<Vector> parsedData = data.map( | ||
new Function<String, Vector>() { | ||
public Vector call(String s) { | ||
String[] sarray = s.trim().split(" "); | ||
double[] values = new double[sarray.length]; | ||
for (int i = 0; i < sarray.length; i++) | ||
values[i] = Double.parseDouble(sarray[i]); | ||
return Vectors.dense(values); | ||
} | ||
} | ||
); | ||
parsedData.cache(); | ||
|
||
// Cluster the data into two classes using GaussianMixture | ||
GaussianMixtureModel gmm = new GaussianMixture().setK(2).run(parsedData.rdd()); | ||
|
||
// Save and load GaussianMixtureModel | ||
gmm.save(jsc.sc(), "target/org/apache/spark/JavaGaussianMixtureExample/GaussianMixtureModel"); | ||
GaussianMixtureModel sameModel = GaussianMixtureModel.load(jsc.sc(), | ||
"target/org.apache.spark.JavaGaussianMixtureExample/GaussianMixtureModel"); | ||
|
||
// Output the parameters of the mixture model | ||
for (int j = 0; j < gmm.k(); j++) { | ||
System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n", | ||
gmm.weights()[j], gmm.gaussians()[j].mu(), gmm.gaussians()[j].sigma()); | ||
} | ||
// $example off$ | ||
|
||
jsc.stop(); | ||
} | ||
} |
72 changes: 72 additions & 0 deletions
72
examples/src/main/java/org/apache/spark/examples/mllib/JavaKMeansExample.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.examples.mllib; | ||
|
||
import org.apache.spark.SparkConf; | ||
import org.apache.spark.api.java.JavaSparkContext; | ||
|
||
// $example on$ | ||
import org.apache.spark.api.java.JavaRDD; | ||
import org.apache.spark.api.java.function.Function; | ||
import org.apache.spark.mllib.clustering.KMeans; | ||
import org.apache.spark.mllib.clustering.KMeansModel; | ||
import org.apache.spark.mllib.linalg.Vector; | ||
import org.apache.spark.mllib.linalg.Vectors; | ||
// $example off$ | ||
|
||
public class JavaKMeansExample { | ||
public static void main(String[] args) { | ||
|
||
SparkConf conf = new SparkConf().setAppName("JavaKMeansExample"); | ||
JavaSparkContext jsc = new JavaSparkContext(conf); | ||
|
||
// $example on$ | ||
// Load and parse data | ||
String path = "data/mllib/kmeans_data.txt"; | ||
JavaRDD<String> data = jsc.textFile(path); | ||
JavaRDD<Vector> parsedData = data.map( | ||
new Function<String, Vector>() { | ||
public Vector call(String s) { | ||
String[] sarray = s.split(" "); | ||
double[] values = new double[sarray.length]; | ||
for (int i = 0; i < sarray.length; i++) | ||
values[i] = Double.parseDouble(sarray[i]); | ||
return Vectors.dense(values); | ||
} | ||
} | ||
); | ||
parsedData.cache(); | ||
|
||
// Cluster the data into two classes using KMeans | ||
int numClusters = 2; | ||
int numIterations = 20; | ||
KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations); | ||
|
||
// Evaluate clustering by computing Within Set Sum of Squared Errors | ||
double WSSSE = clusters.computeCost(parsedData.rdd()); | ||
System.out.println("Within Set Sum of Squared Errors = " + WSSSE); | ||
|
||
// Save and load model | ||
clusters.save(jsc.sc(), "target/org/apache/spark/JavaKMeansExample/KMeansModel"); | ||
KMeansModel sameModel = KMeansModel.load(jsc.sc(), | ||
"target/org/apache/spark/JavaKMeansExample/KMeansModel"); | ||
// $example off$ | ||
|
||
jsc.stop(); | ||
} | ||
} |
93 changes: 93 additions & 0 deletions
93
...s/src/main/java/org/apache/spark/examples/mllib/JavaLatentDirichletAllocationExample.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.examples.mllib; | ||
|
||
import org.apache.spark.SparkConf; | ||
import org.apache.spark.api.java.JavaSparkContext; | ||
|
||
// $example on$ | ||
import scala.Tuple2; | ||
|
||
import org.apache.spark.api.java.JavaPairRDD; | ||
import org.apache.spark.api.java.JavaRDD; | ||
import org.apache.spark.api.java.function.Function; | ||
import org.apache.spark.mllib.clustering.DistributedLDAModel; | ||
import org.apache.spark.mllib.clustering.LDA; | ||
import org.apache.spark.mllib.clustering.LDAModel; | ||
import org.apache.spark.mllib.linalg.Matrix; | ||
import org.apache.spark.mllib.linalg.Vector; | ||
import org.apache.spark.mllib.linalg.Vectors; | ||
// $example off$ | ||
|
||
public class JavaLatentDirichletAllocationExample { | ||
public static void main(String[] args) { | ||
|
||
SparkConf conf = new SparkConf().setAppName("JavaKLatentDirichletAllocationExample"); | ||
JavaSparkContext jsc = new JavaSparkContext(conf); | ||
|
||
// $example on$ | ||
// Load and parse the data | ||
String path = "data/mllib/sample_lda_data.txt"; | ||
JavaRDD<String> data = jsc.textFile(path); | ||
JavaRDD<Vector> parsedData = data.map( | ||
new Function<String, Vector>() { | ||
public Vector call(String s) { | ||
String[] sarray = s.trim().split(" "); | ||
double[] values = new double[sarray.length]; | ||
for (int i = 0; i < sarray.length; i++) | ||
values[i] = Double.parseDouble(sarray[i]); | ||
return Vectors.dense(values); | ||
} | ||
} | ||
); | ||
// Index documents with unique IDs | ||
JavaPairRDD<Long, Vector> corpus = | ||
JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map( | ||
new Function<Tuple2<Vector, Long>, Tuple2<Long, Vector>>() { | ||
public Tuple2<Long, Vector> call(Tuple2<Vector, Long> doc_id) { | ||
return doc_id.swap(); | ||
} | ||
} | ||
) | ||
); | ||
corpus.cache(); | ||
|
||
// Cluster the documents into three topics using LDA | ||
LDAModel ldaModel = new LDA().setK(3).run(corpus); | ||
|
||
// Output topics. Each is a distribution over words (matching word count vectors) | ||
System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize() | ||
+ " words):"); | ||
Matrix topics = ldaModel.topicsMatrix(); | ||
for (int topic = 0; topic < 3; topic++) { | ||
System.out.print("Topic " + topic + ":"); | ||
for (int word = 0; word < ldaModel.vocabSize(); word++) { | ||
System.out.print(" " + topics.apply(word, topic)); | ||
} | ||
System.out.println(); | ||
} | ||
|
||
ldaModel.save(jsc.sc(), | ||
"target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel"); | ||
DistributedLDAModel sameModel = DistributedLDAModel.load(jsc.sc(), | ||
"target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel"); | ||
// $example off$ | ||
|
||
jsc.stop(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
51 changes: 51 additions & 0 deletions
51
examples/src/main/python/mllib/gaussian_mixture_example.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
from __future__ import print_function | ||
|
||
# $example on$ | ||
from numpy import array | ||
# $example off$ | ||
|
||
from pyspark import SparkContext | ||
# $example on$ | ||
from pyspark.mllib.clustering import GaussianMixture, GaussianMixtureModel | ||
# $example off$ | ||
|
||
if __name__ == "__main__":
    sc = SparkContext(appName="GaussianMixtureExample")  # SparkContext

    # $example on$
    # Read the input file and turn every line into a numpy array of floats
    lines = sc.textFile("data/mllib/gmm_data.txt")
    parsedData = lines.map(lambda row: array([float(tok) for tok in row.strip().split(' ')]))

    # Fit a Gaussian mixture model with two components
    gmm = GaussianMixture.train(parsedData, 2)

    # Persist the model and read it back from the same location
    model_path = "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel"
    gmm.save(sc, model_path)
    sameModel = GaussianMixtureModel.load(sc, model_path)

    # Print weight, mean and covariance of each of the two components
    for idx in range(2):
        component = gmm.gaussians[idx]
        print("weight = ", gmm.weights[idx], "mu = ", component.mu,
              "sigma = ", component.sigma.toArray())
    # $example off$

    sc.stop()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
from __future__ import print_function | ||
|
||
# $example on$ | ||
from numpy import array | ||
from math import sqrt | ||
# $example off$ | ||
|
||
from pyspark import SparkContext | ||
# $example on$ | ||
from pyspark.mllib.clustering import KMeans, KMeansModel | ||
# $example off$ | ||
|
||
if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data: each line holds space-separated floats
    data = sc.textFile("data/mllib/kmeans_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10,
                            runs=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        """Return the squared Euclidean distance from `point` to its assigned center."""
        center = clusters.centers[clusters.predict(point)]
        # No square root here: WSSSE sums *squared* distances, not distances.
        return sum([x**2 for x in (point - center)])

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    sameModel = KMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    # $example off$

    sc.stop()
Oops, something went wrong.