In [0]:
# K-means algorithm for clustering

# K-means is one of the most commonly used clustering algorithms; it partitions the data points into a predefined number of clusters. The MLlib implementation includes a parallelized variant of the k-means++ method called k-means||.

#KMeans is implemented as an Estimator and generates a KMeansModel as the base model.

In [0]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [0]:
# Load the k-means sample data from DBFS using the "libsvm" reader.
# `spark` is not defined in the visible cells; presumably the session object
# is provided by the hosting environment — TODO confirm.
dataset = spark.read.format("libsvm").load("dbfs:/FileStore/shared_uploads/salomi0030@gmail.com/sample_kmeans_data-1.txt")

# Alternative input path noted by the author (not loaded here): dbfs:/FileStore/shared_uploads/salomi0030@gmail.com/samp1.txt
dataset

Out[39]: DataFrame[label: double, features: vector]

In [0]:
# Train k-means: configure the estimator for 2 clusters with random seed 1,
# then fit it to the loaded dataset.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(dataset)


In [0]:
# Make predictions: apply the fitted model to the dataset.
# NOTE(review): `model` and `dataset` are rebound by the LDA cells further
# down, so this cell only yields k-means output when the notebook is run in order.
predictions = model.transform(dataset)

In [0]:
# Score the clustering with a default-configured ClusteringEvaluator and report it.
# NOTE(review): the message assumes the evaluator's defaults are the Silhouette
# metric with squared Euclidean distance — confirm for the Spark version in use.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette!s}")


Silhouette with squared euclidean distance = 0.9997530305375207


In [0]:
# Show the result: fetch the cluster centers from the fitted model and print
# them one per line under a heading.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]


In [0]:
#Latent Dirichlet allocation (LDA)

# LDA is implemented as an Estimator that supports both EMLDAOptimizer and OnlineLDAOptimizer, and generates an LDAModel as the base model. Expert users may cast an LDAModel generated by EMLDAOptimizer to a DistributedLDAModel if needed.

In [0]:
from pyspark.ml.clustering import LDA

In [0]:
# Load the LDA sample data from DBFS using the "libsvm" reader.
# NOTE(review): this rebinds `dataset`, replacing the k-means data loaded
# earlier; re-running the k-means cells after this one would use the LDA data.
dataset = spark.read.format("libsvm").load("dbfs:/FileStore/shared_uploads/salomi0030@gmail.com/sample_lda_libsvm_data.txt")

In [0]:
# Train LDA: configure the estimator for 10 topics and at most 10 iterations,
# then fit it to the loaded dataset.
# NOTE(review): this rebinds `model`, replacing the k-means model fitted earlier.
lda = LDA().setK(10).setMaxIter(10)
model = lda.fit(dataset)

# Compute both metrics on the training data first, then report them.
ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print(f"The lower bound on the log likelihood of the entire corpus: {ll!s}")
print(f"The upper bound on perplexity: {lp!s}")


The lower bound on the log likelihood of the entire corpus: -797.5200544004555
The upper bound on perplexity: 3.0673848246171365


In [0]:
# Describe topics: ask the fitted model for the 3 top-weighted terms of each
# topic and display the resulting table without truncating column values.
topics = model.describeTopics(maxTermsPerTopic=3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)


The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[1, 3, 4]  |[0.10368082673401043, 0.10246920001021176, 0.09945342991888821]|
|1    |[0, 5, 9]  |[0.1076176051626867, 0.09801051465962088, 0.09705006627909334] |
|2    |[5, 10, 9] |[0.09817190617101527, 0.0981118296756393, 0.09564406780739915] |
|3    |[5, 10, 2] |[0.10428733568856435, 0.10200642097504846, 0.09787247160826047]|
|4    |[5, 8, 2]  |[0.10610350651837704, 0.10225089640708551, 0.0969920925809682] |
|5    |[2, 1, 5]  |[0.1017814308587037, 0.09673789329662605, 0.09602700830858574] |
|6    |[3, 5, 4]  |[0.1616436533169385, 0.1391919780709568, 0.11423150148553422]  |
|7    |[8, 3, 5]  |[0.10449091906046457, 0.09702934195910436, 0.09685753582283695]|
|8    |[2, 10, 5] |[0.2048

In [0]:
# Show the result: run the fitted model over the dataset and display the rows
# with column values untruncated. The recorded output shows a
# `topicDistribution` column alongside `label` and `features`.
transformed = model.transform(dataset)
transformed.show(truncate=False)

+-----+---------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                       |topicDistribution                                                                                                                                                                                                     |
+-----+---------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(11,[0,1,2,4,5,6,7,10],[1.0,2.0,6.0,2.0,3.0,1.0,1.0,3.0])      |[0.004676952344250045,0.004676968395495595,0.004676950578410848,0.00