diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 77c53032b7c8b..d89b784b6ad91 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -71,13 +71,15 @@ private[python] class PythonMLLibAPI extends Serializable { minPartitions: Int): JavaRDD[LabeledPoint] = MLUtils.loadLabeledPoints(jsc.sc, path, minPartitions) - def appendBias(data: org.apache.spark.mllib.linalg.Vector) - : org.apache.spark.mllib.linalg.Vector - = MLUtils.appendBias(data) - - def loadVectors(jsc: JavaSparkContext, path: String) - : RDD[org.apache.spark.mllib.linalg.Vector] - = MLUtils.loadVectors(jsc.sc, path) + /** + * Loads and serializes vectors saved with `RDD#saveAsTextFile`. + * @param jsc Java SparkContext + * @param path file or directory path in any Hadoop-supported file system URI + * @return serialized vectors in a RDD + */ + def loadVectors(jsc: JavaSparkContext, + path: String): RDD[Vector] = + MLUtils.loadVectors(jsc.sc, path) private def trainRegressionModel( learner: GeneralizedLinearAlgorithm[_ <: GeneralizedLinearModel], diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index b1ed263ba9a9c..46a1a2645e6b0 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -821,7 +821,7 @@ def test_model_transform(self): class MLUtilsTests(MLlibTestCase): def test_append_bias(self): - data = [1.0, 2.0, 3.0] + data = [2.0, 2.0, 2.0] ret = MLUtils.appendBias(data) self.assertEqual(ret[3], 1.0) @@ -832,14 +832,17 @@ def test_load_vectors(self): [1.0, 2.0, 3.0] ] try: - self.sc.parallelize(data).saveAsTextFile("test_load_vectors") - ret_rdd = MLUtils.loadVectors(self.sc, "test_load_vectors") + temp_dir = tempfile.mkdtemp() + load_vectors_path = os.path.join(temp_dir, "test_load_vectors") + self.sc.parallelize(data).saveAsTextFile(load_vectors_path) + ret_rdd = MLUtils.loadVectors(self.sc, load_vectors_path) ret = ret_rdd.collect() + ret.sort() self.assertEqual(len(ret), 2) self.assertEqual(ret[0], DenseVector([1.0, 2.0, 3.0])) self.assertEqual(ret[1], DenseVector([1.0, 2.0, 3.0])) finally: - shutil.rmtree("test_load_vectors") + shutil.rmtree(load_vectors_path) if __name__ == "__main__": diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index 4a1e069c6dbff..41a4e4628cf71 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -171,10 +171,18 @@ def loadLabeledPoints(sc, path, minPartitions=None): @staticmethod def appendBias(data): - return callMLlibFunc("appendBias", _convert_to_vector(data)) + """ + Returns a new vector with `1.0` (bias) appended to the input vector. + """ + vec = _convert_to_vector(data) + return np.append(vec, 1.0) @staticmethod def loadVectors(sc, path): + """ + Loads vectors saved using `RDD[Vector].saveAsTextFile` + with the default number of partitions. + """ return callMLlibFunc("loadVectors", sc, path)