[SPARK-13013][DOCS] Replace example code in mllib-clustering.md using include_example

Replace example code in mllib-clustering.md using include_example
https://issues.apache.org/jira/browse/SPARK-13013

The example code in the user guide is embedded in the markdown and hence is not easy to test. It would be nice to test the examples automatically. This JIRA is to discuss options to automate example code testing and see what we can do in Spark 1.6.

The goal is to move the actual example code to spark/examples and test compilation in Jenkins builds. Then, in the markdown, we can reference part of the code to show in the user guide. This requires adding a Jekyll tag similar to https://github.com/jekyll/jekyll/blob/master/lib/jekyll/tags/include.rb, e.g., called include_example.
`{% include_example scala/org/apache/spark/examples/mllib/KMeansExample.scala %}`
Jekyll will find `examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala`, pick out the code blocks marked "example", and replace the tag with those blocks wrapped in `{% highlight %}` in the rendered markdown.
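
For illustration, here is roughly how the sentinel comments pair with the tag (a minimal sketch based on the marker convention visible in the files below; the surrounding KMeans calls are just placeholders):

// $example on$
// Only the code between these markers is rendered in the user guide
int numClusters = 2;
KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, 20);
// $example off$

Setup code outside the markers (SparkConf, context creation) stays in the runnable example but out of the generated docs.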

See more sub-tasks in parent ticket: https://issues.apache.org/jira/browse/SPARK-11337

Author: Xin Ren <iamshrek@126.com>

Closes apache#11116 from keypointt/SPARK-13013.
keypointt authored and roygao94 committed Mar 22, 2016
1 parent 790c502 commit ea45796
Showing 16 changed files with 715 additions and 447 deletions.
2 changes: 2 additions & 0 deletions data/mllib/streaming_kmeans_data_test.txt
@@ -0,0 +1,2 @@
(1.0), [1.7, 0.4, 0.9]
(2.0), [2.2, 1.8, 0.0]
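Each line appears to follow the `(label), [features]` format that `LabeledPoint.parse` reads, presumably as test input for the streaming k-means example that is among the 16 changed files but not shown here.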
460 changes: 16 additions & 444 deletions docs/mllib-clustering.md

Large diffs are not rendered by default.

72 changes: 72 additions & 0 deletions examples/src/main/java/org/apache/spark/examples/mllib/JavaGaussianMixtureExample.java
@@ -0,0 +1,72 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// $example on$
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.clustering.GaussianMixture;
import org.apache.spark.mllib.clustering.GaussianMixtureModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
// $example off$

public class JavaGaussianMixtureExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaGaussianMixtureExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse data
    String path = "data/mllib/gmm_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.trim().split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    parsedData.cache();

    // Cluster the data into two classes using GaussianMixture
    GaussianMixtureModel gmm = new GaussianMixture().setK(2).run(parsedData.rdd());

    // Save and load GaussianMixtureModel
    gmm.save(jsc.sc(), "target/org/apache/spark/JavaGaussianMixtureExample/GaussianMixtureModel");
    GaussianMixtureModel sameModel = GaussianMixtureModel.load(jsc.sc(),
      "target/org/apache/spark/JavaGaussianMixtureExample/GaussianMixtureModel");

    // Output the parameters of the mixture model
    for (int j = 0; j < gmm.k(); j++) {
      System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n",
        gmm.weights()[j], gmm.gaussians()[j].mu(), gmm.gaussians()[j].sigma());
    }
    // $example off$

    jsc.stop();
  }
}
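Assuming a standard Spark checkout, this can be launched with the example runner, e.g. `./bin/run-example mllib.JavaGaussianMixtureExample` (run-example fills in the org.apache.spark.examples package prefix).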
72 changes: 72 additions & 0 deletions examples/src/main/java/org/apache/spark/examples/mllib/JavaKMeansExample.java
@@ -0,0 +1,72 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// $example on$
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
// $example off$

public class JavaKMeansExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKMeansExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse data
    String path = "data/mllib/kmeans_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    parsedData.cache();

    // Cluster the data into two classes using KMeans
    int numClusters = 2;
    int numIterations = 20;
    KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations);

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    double WSSSE = clusters.computeCost(parsedData.rdd());
    System.out.println("Within Set Sum of Squared Errors = " + WSSSE);

    // Save and load model
    clusters.save(jsc.sc(), "target/org/apache/spark/JavaKMeansExample/KMeansModel");
    KMeansModel sameModel = KMeansModel.load(jsc.sc(),
      "target/org/apache/spark/JavaKMeansExample/KMeansModel");
    // $example off$

    jsc.stop();
  }
}
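As an aside, on Java 8 the anonymous Function above collapses to a lambda; a sketch of the equivalent parsing step, since org.apache.spark.api.java.function.Function is a functional interface:

JavaRDD<Vector> parsedData = data.map(s -> {
  // Split on spaces and convert each token to a double
  String[] sarray = s.split(" ");
  double[] values = new double[sarray.length];
  for (int i = 0; i < sarray.length; i++) {
    values[i] = Double.parseDouble(sarray[i]);
  }
  return Vectors.dense(values);
});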
93 changes: 93 additions & 0 deletions examples/src/main/java/org/apache/spark/examples/mllib/JavaLatentDirichletAllocationExample.java
@@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.mllib;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// $example on$
import scala.Tuple2;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.clustering.DistributedLDAModel;
import org.apache.spark.mllib.clustering.LDA;
import org.apache.spark.mllib.clustering.LDAModel;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
// $example off$

public class JavaLatentDirichletAllocationExample {
  public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaLatentDirichletAllocationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse the data
    String path = "data/mllib/sample_lda_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.trim().split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    // Index documents with unique IDs
    JavaPairRDD<Long, Vector> corpus =
      JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map(
        new Function<Tuple2<Vector, Long>, Tuple2<Long, Vector>>() {
          public Tuple2<Long, Vector> call(Tuple2<Vector, Long> docId) {
            return docId.swap();
          }
        }
      )
    );
    corpus.cache();

    // Cluster the documents into three topics using LDA
    LDAModel ldaModel = new LDA().setK(3).run(corpus);

    // Output topics. Each is a distribution over words (matching word count vectors)
    System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize()
      + " words):");
    Matrix topics = ldaModel.topicsMatrix();
    for (int topic = 0; topic < 3; topic++) {
      System.out.print("Topic " + topic + ":");
      for (int word = 0; word < ldaModel.vocabSize(); word++) {
        System.out.print(" " + topics.apply(word, topic));
      }
      System.out.println();
    }

    ldaModel.save(jsc.sc(),
      "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    DistributedLDAModel sameModel = DistributedLDAModel.load(jsc.sc(),
      "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    // $example off$

    jsc.stop();
  }
}
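Two details worth noting: DistributedLDAModel.load works here because LDA's default optimizer ("em") produces a DistributedLDAModel — a model trained with the online optimizer would load as a LocalLDAModel instead — and topicsMatrix() is vocabSize x k, which is why the loop indexes it as topics.apply(word, topic).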
4 changes: 4 additions & 0 deletions examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java
@@ -24,8 +24,10 @@
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import org.apache.spark.mllib.clustering.PowerIterationClustering;
import org.apache.spark.mllib.clustering.PowerIterationClusteringModel;
// $example off$

/**
* Java example for graph clustering using power iteration clustering (PIC).
@@ -36,6 +38,7 @@ public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    @SuppressWarnings("unchecked")
    // $example on$
    JavaRDD<Tuple3<Long, Long, Double>> similarities = sc.parallelize(Lists.newArrayList(
      new Tuple3<Long, Long, Double>(0L, 1L, 0.9),
      new Tuple3<Long, Long, Double>(1L, 2L, 0.9),
@@ -51,6 +54,7 @@ public static void main(String[] args) {
    for (PowerIterationClustering.Assignment a: model.assignments().toJavaRDD().collect()) {
      System.out.println(a.id() + " -> " + a.cluster());
    }
    // $example off$

    sc.stop();
  }
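For context, the similarities RDD feeding PIC is a list of (srcId, dstId, similarity) tuples describing an affinity graph, and model.assignments() maps each vertex id to a cluster, which is what the loop above prints.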
51 changes: 51 additions & 0 deletions examples/src/main/python/mllib/gaussian_mixture_example.py
@@ -0,0 +1,51 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from numpy import array
# $example off$

from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import GaussianMixture, GaussianMixtureModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="GaussianMixtureExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/gmm_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.strip().split(' ')]))

    # Build the model (cluster the data)
    gmm = GaussianMixture.train(parsedData, 2)

    # Save and load model
    gmm.save(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")
    sameModel = GaussianMixtureModel\
        .load(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")

    # output parameters of model
    for i in range(2):
        print("weight = ", gmm.weights[i], "mu = ", gmm.gaussians[i].mu,
              "sigma = ", gmm.gaussians[i].sigma.toArray())
    # $example off$

    sc.stop()
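Assuming a standard checkout, the Python examples run under spark-submit, e.g. `./bin/spark-submit examples/src/main/python/mllib/gaussian_mixture_example.py`.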
55 changes: 55 additions & 0 deletions examples/src/main/python/mllib/k_means_example.py
@@ -0,0 +1,55 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from numpy import array
from math import sqrt
# $example off$

from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import KMeans, KMeansModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/kmeans_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10,
                            runs=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    sameModel = KMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    # $example off$

    sc.stop()
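One caveat: the runs parameter to KMeans.train was deprecated in Spark 1.6 and ignored in later releases, so on newer versions runs=10 can be dropped without changing behavior.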
