In [0]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [0]:
# Read the sample k-means data file, which is stored in LIBSVM format.
dataset = spark.read.load(
    "/FileStore/tables/sample_kmeans_data.txt",
    format="libsvm",
)
# Bare expression so the notebook displays the DataFrame's schema summary.
dataset

Out[3]: DataFrame[label: double, features: vector]

In [0]:
# Configure k-means for two clusters; the seed is pinned so repeated runs
# start from the same initialization.
kmeans = KMeans(k=2, seed=1)

# Fit the estimator on the loaded data to obtain the trained model.
model = kmeans.fit(dataset)

In [0]:
# Apply the fitted k-means model to the same data it was trained on; the
# resulting DataFrame is what the evaluator in the next cell scores.
predictions = model.transform(dataset)

In [0]:
# Score the clustering with a ClusteringEvaluator left at its default settings.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)

# sep="" keeps the printed line identical to plain string concatenation.
print("Silhouette with squared euclidean distance = ", silhouette, sep="")


Silhouette with squared euclidean distance = 0.9997530305375207


In [0]:
# Fetch the fitted cluster centers and print them one per line under a header.
centers = model.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)
    

Cluster Centers: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]


In [0]:
# LDA is implemented as an Estimator that supports both EMLDAOptimizer and OnlineLDAOptimizer, and generates an LDAModel as the base model. Expert users may cast an LDAModel generated by EMLDAOptimizer to a DistributedLDAModel if needed.

In [0]:
from pyspark.ml.clustering import LDA

In [0]:
# Read the sample LDA corpus (LIBSVM format) from DBFS.
# NOTE: this rebinds `dataset`, which previously held the k-means sample data.
dataset = spark.read.load(
    "dbfs:/FileStore/shared_uploads/salomi0030@gmail.com/sample_lda_libsvm_data.txt",
    format="libsvm",
)

In [0]:
# Fit a 10-topic LDA model, capping training at 10 iterations.
# The seed is pinned (as the KMeans cell does with its own seed) so that the
# fitted topics and the metrics printed below are reproducible across runs.
lda = LDA(k=10, maxIter=10, seed=1)
# NOTE: this rebinds `model`, which previously held the fitted KMeans model.
model = lda.fit(dataset)

# Evaluate the fitted model on the same corpus it was trained on.
ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)

print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))


In [0]:
# Summarize each topic by its three highest-weighted terms.
topics = model.describeTopics(maxTermsPerTopic=3)
print("The topics described by their top-weighted terms:")
# Disable truncation so wide cell values are printed in full.
topics.show(truncate=False)

In [0]:
# Run the fitted LDA model over the corpus and display the result.
transformed = model.transform(dataset)
# n=20 is show()'s default row count, spelled out here; truncation is disabled
# so wide cell values are printed in full.
transformed.show(n=20, truncate=False)